ipvs: optimize dst usage for real server
[firefly-linux-kernel-4.4.55.git] / net / netfilter / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * Changes:
18  *
19  */
20
21 #define KMSG_COMPONENT "IPVS"
22 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24 #include <linux/module.h>
25 #include <linux/init.h>
26 #include <linux/types.h>
27 #include <linux/capability.h>
28 #include <linux/fs.h>
29 #include <linux/sysctl.h>
30 #include <linux/proc_fs.h>
31 #include <linux/workqueue.h>
32 #include <linux/swap.h>
33 #include <linux/seq_file.h>
34 #include <linux/slab.h>
35
36 #include <linux/netfilter.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/mutex.h>
39
40 #include <net/net_namespace.h>
41 #include <linux/nsproxy.h>
42 #include <net/ip.h>
43 #ifdef CONFIG_IP_VS_IPV6
44 #include <net/ipv6.h>
45 #include <net/ip6_route.h>
46 #endif
47 #include <net/route.h>
48 #include <net/sock.h>
49 #include <net/genetlink.h>
50
51 #include <asm/uaccess.h>
52
53 #include <net/ip_vs.h>
54
/*
 * Mutex guarding the IPVS sockopts.  A sleeping lock is used because
 * [gs]etsockopt may sleep.
 */
static DEFINE_MUTEX(__ip_vs_mutex);

/* reader/writer lock for the service table */
static DEFINE_RWLOCK(__ip_vs_svc_lock);
60
61 /* sysctl variables */
62
63 #ifdef CONFIG_IP_VS_DEBUG
64 static int sysctl_ip_vs_debug_level = 0;
65
66 int ip_vs_get_debug_level(void)
67 {
68         return sysctl_ip_vs_debug_level;
69 }
70 #endif
71
72
73 /*  Protos */
74 static void __ip_vs_del_service(struct ip_vs_service *svc);
75
76
77 #ifdef CONFIG_IP_VS_IPV6
78 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
79 static bool __ip_vs_addr_is_local_v6(struct net *net,
80                                      const struct in6_addr *addr)
81 {
82         struct flowi6 fl6 = {
83                 .daddr = *addr,
84         };
85         struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
86         bool is_local;
87
88         is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
89
90         dst_release(dst);
91         return is_local;
92 }
93 #endif
94
95 #ifdef CONFIG_SYSCTL
96 /*
97  *      update_defense_level is called from keventd and from sysctl,
98  *      so it needs to protect itself from softirqs
99  */
100 static void update_defense_level(struct netns_ipvs *ipvs)
101 {
102         struct sysinfo i;
103         static int old_secure_tcp = 0;
104         int availmem;
105         int nomem;
106         int to_change = -1;
107
108         /* we only count free and buffered memory (in pages) */
109         si_meminfo(&i);
110         availmem = i.freeram + i.bufferram;
111         /* however in linux 2.5 the i.bufferram is total page cache size,
112            we need adjust it */
113         /* si_swapinfo(&i); */
114         /* availmem = availmem - (i.totalswap - i.freeswap); */
115
116         nomem = (availmem < ipvs->sysctl_amemthresh);
117
118         local_bh_disable();
119
120         /* drop_entry */
121         spin_lock(&ipvs->dropentry_lock);
122         switch (ipvs->sysctl_drop_entry) {
123         case 0:
124                 atomic_set(&ipvs->dropentry, 0);
125                 break;
126         case 1:
127                 if (nomem) {
128                         atomic_set(&ipvs->dropentry, 1);
129                         ipvs->sysctl_drop_entry = 2;
130                 } else {
131                         atomic_set(&ipvs->dropentry, 0);
132                 }
133                 break;
134         case 2:
135                 if (nomem) {
136                         atomic_set(&ipvs->dropentry, 1);
137                 } else {
138                         atomic_set(&ipvs->dropentry, 0);
139                         ipvs->sysctl_drop_entry = 1;
140                 };
141                 break;
142         case 3:
143                 atomic_set(&ipvs->dropentry, 1);
144                 break;
145         }
146         spin_unlock(&ipvs->dropentry_lock);
147
148         /* drop_packet */
149         spin_lock(&ipvs->droppacket_lock);
150         switch (ipvs->sysctl_drop_packet) {
151         case 0:
152                 ipvs->drop_rate = 0;
153                 break;
154         case 1:
155                 if (nomem) {
156                         ipvs->drop_rate = ipvs->drop_counter
157                                 = ipvs->sysctl_amemthresh /
158                                 (ipvs->sysctl_amemthresh-availmem);
159                         ipvs->sysctl_drop_packet = 2;
160                 } else {
161                         ipvs->drop_rate = 0;
162                 }
163                 break;
164         case 2:
165                 if (nomem) {
166                         ipvs->drop_rate = ipvs->drop_counter
167                                 = ipvs->sysctl_amemthresh /
168                                 (ipvs->sysctl_amemthresh-availmem);
169                 } else {
170                         ipvs->drop_rate = 0;
171                         ipvs->sysctl_drop_packet = 1;
172                 }
173                 break;
174         case 3:
175                 ipvs->drop_rate = ipvs->sysctl_am_droprate;
176                 break;
177         }
178         spin_unlock(&ipvs->droppacket_lock);
179
180         /* secure_tcp */
181         spin_lock(&ipvs->securetcp_lock);
182         switch (ipvs->sysctl_secure_tcp) {
183         case 0:
184                 if (old_secure_tcp >= 2)
185                         to_change = 0;
186                 break;
187         case 1:
188                 if (nomem) {
189                         if (old_secure_tcp < 2)
190                                 to_change = 1;
191                         ipvs->sysctl_secure_tcp = 2;
192                 } else {
193                         if (old_secure_tcp >= 2)
194                                 to_change = 0;
195                 }
196                 break;
197         case 2:
198                 if (nomem) {
199                         if (old_secure_tcp < 2)
200                                 to_change = 1;
201                 } else {
202                         if (old_secure_tcp >= 2)
203                                 to_change = 0;
204                         ipvs->sysctl_secure_tcp = 1;
205                 }
206                 break;
207         case 3:
208                 if (old_secure_tcp < 2)
209                         to_change = 1;
210                 break;
211         }
212         old_secure_tcp = ipvs->sysctl_secure_tcp;
213         if (to_change >= 0)
214                 ip_vs_protocol_timeout_change(ipvs,
215                                               ipvs->sysctl_secure_tcp > 1);
216         spin_unlock(&ipvs->securetcp_lock);
217
218         local_bh_enable();
219 }
220
221
222 /*
223  *      Timer for checking the defense
224  */
/* Parenthesized so the period stays a single operand wherever it expands. */
#define DEFENSE_TIMER_PERIOD	(1*HZ)
226
227 static void defense_work_handler(struct work_struct *work)
228 {
229         struct netns_ipvs *ipvs =
230                 container_of(work, struct netns_ipvs, defense_work.work);
231
232         update_defense_level(ipvs);
233         if (atomic_read(&ipvs->dropentry))
234                 ip_vs_random_dropentry(ipvs->net);
235         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
236 }
237 #endif
238
239 int
240 ip_vs_use_count_inc(void)
241 {
242         return try_module_get(THIS_MODULE);
243 }
244
245 void
246 ip_vs_use_count_dec(void)
247 {
248         module_put(THIS_MODULE);
249 }
250
251
/*
 *      Hash table: for virtual service lookups
 *
 *      Both tables below have IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
 *      buckets; IP_VS_SVC_TAB_MASK reduces a hash value to a bucket index.
 */
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)

/* the service table hashed by <protocol, addr, port> */
static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
263
264
265 /*
266  *      Returns hash value for virtual service
267  */
268 static inline unsigned int
269 ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
270                   const union nf_inet_addr *addr, __be16 port)
271 {
272         register unsigned int porth = ntohs(port);
273         __be32 addr_fold = addr->ip;
274         __u32 ahash;
275
276 #ifdef CONFIG_IP_VS_IPV6
277         if (af == AF_INET6)
278                 addr_fold = addr->ip6[0]^addr->ip6[1]^
279                             addr->ip6[2]^addr->ip6[3];
280 #endif
281         ahash = ntohl(addr_fold);
282         ahash ^= ((size_t) net >> 8);
283
284         return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
285                IP_VS_SVC_TAB_MASK;
286 }
287
288 /*
289  *      Returns hash value of fwmark for virtual service lookup
290  */
291 static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
292 {
293         return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
294 }
295
296 /*
297  *      Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
298  *      or in the ip_vs_svc_fwm_table by fwmark.
299  *      Should be called with locked tables.
300  */
301 static int ip_vs_svc_hash(struct ip_vs_service *svc)
302 {
303         unsigned int hash;
304
305         if (svc->flags & IP_VS_SVC_F_HASHED) {
306                 pr_err("%s(): request for already hashed, called from %pF\n",
307                        __func__, __builtin_return_address(0));
308                 return 0;
309         }
310
311         if (svc->fwmark == 0) {
312                 /*
313                  *  Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
314                  */
315                 hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
316                                          &svc->addr, svc->port);
317                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
318         } else {
319                 /*
320                  *  Hash it by fwmark in svc_fwm_table
321                  */
322                 hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
323                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
324         }
325
326         svc->flags |= IP_VS_SVC_F_HASHED;
327         /* increase its refcnt because it is referenced by the svc table */
328         atomic_inc(&svc->refcnt);
329         return 1;
330 }
331
332
333 /*
334  *      Unhashes a service from svc_table / svc_fwm_table.
335  *      Should be called with locked tables.
336  */
337 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
338 {
339         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
340                 pr_err("%s(): request for unhash flagged, called from %pF\n",
341                        __func__, __builtin_return_address(0));
342                 return 0;
343         }
344
345         if (svc->fwmark == 0) {
346                 /* Remove it from the svc_table table */
347                 list_del(&svc->s_list);
348         } else {
349                 /* Remove it from the svc_fwm_table table */
350                 list_del(&svc->f_list);
351         }
352
353         svc->flags &= ~IP_VS_SVC_F_HASHED;
354         atomic_dec(&svc->refcnt);
355         return 1;
356 }
357
358
359 /*
360  *      Get service by {netns, proto,addr,port} in the service table.
361  */
362 static inline struct ip_vs_service *
363 __ip_vs_service_find(struct net *net, int af, __u16 protocol,
364                      const union nf_inet_addr *vaddr, __be16 vport)
365 {
366         unsigned int hash;
367         struct ip_vs_service *svc;
368
369         /* Check for "full" addressed entries */
370         hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
371
372         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
373                 if ((svc->af == af)
374                     && ip_vs_addr_equal(af, &svc->addr, vaddr)
375                     && (svc->port == vport)
376                     && (svc->protocol == protocol)
377                     && net_eq(svc->net, net)) {
378                         /* HIT */
379                         return svc;
380                 }
381         }
382
383         return NULL;
384 }
385
386
387 /*
388  *      Get service by {fwmark} in the service table.
389  */
390 static inline struct ip_vs_service *
391 __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
392 {
393         unsigned int hash;
394         struct ip_vs_service *svc;
395
396         /* Check for fwmark addressed entries */
397         hash = ip_vs_svc_fwm_hashkey(net, fwmark);
398
399         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
400                 if (svc->fwmark == fwmark && svc->af == af
401                     && net_eq(svc->net, net)) {
402                         /* HIT */
403                         return svc;
404                 }
405         }
406
407         return NULL;
408 }
409
410 struct ip_vs_service *
411 ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
412                   const union nf_inet_addr *vaddr, __be16 vport)
413 {
414         struct ip_vs_service *svc;
415         struct netns_ipvs *ipvs = net_ipvs(net);
416
417         read_lock(&__ip_vs_svc_lock);
418
419         /*
420          *      Check the table hashed by fwmark first
421          */
422         if (fwmark) {
423                 svc = __ip_vs_svc_fwm_find(net, af, fwmark);
424                 if (svc)
425                         goto out;
426         }
427
428         /*
429          *      Check the table hashed by <protocol,addr,port>
430          *      for "full" addressed entries
431          */
432         svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
433
434         if (svc == NULL
435             && protocol == IPPROTO_TCP
436             && atomic_read(&ipvs->ftpsvc_counter)
437             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
438                 /*
439                  * Check if ftp service entry exists, the packet
440                  * might belong to FTP data connections.
441                  */
442                 svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
443         }
444
445         if (svc == NULL
446             && atomic_read(&ipvs->nullsvc_counter)) {
447                 /*
448                  * Check if the catch-all port (port zero) exists
449                  */
450                 svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
451         }
452
453   out:
454         if (svc)
455                 atomic_inc(&svc->usecnt);
456         read_unlock(&__ip_vs_svc_lock);
457
458         IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
459                       fwmark, ip_vs_proto_name(protocol),
460                       IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
461                       svc ? "hit" : "not hit");
462
463         return svc;
464 }
465
466
467 static inline void
468 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
469 {
470         atomic_inc(&svc->refcnt);
471         dest->svc = svc;
472 }
473
474 static void
475 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
476 {
477         struct ip_vs_service *svc = dest->svc;
478
479         dest->svc = NULL;
480         if (atomic_dec_and_test(&svc->refcnt)) {
481                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
482                               svc->fwmark,
483                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
484                               ntohs(svc->port), atomic_read(&svc->usecnt));
485                 free_percpu(svc->stats.cpustats);
486                 kfree(svc);
487         }
488 }
489
490
491 /*
492  *      Returns hash value for real service
493  */
494 static inline unsigned int ip_vs_rs_hashkey(int af,
495                                             const union nf_inet_addr *addr,
496                                             __be16 port)
497 {
498         register unsigned int porth = ntohs(port);
499         __be32 addr_fold = addr->ip;
500
501 #ifdef CONFIG_IP_VS_IPV6
502         if (af == AF_INET6)
503                 addr_fold = addr->ip6[0]^addr->ip6[1]^
504                             addr->ip6[2]^addr->ip6[3];
505 #endif
506
507         return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
508                 & IP_VS_RTAB_MASK;
509 }
510
511 /*
512  *      Hashes ip_vs_dest in rs_table by <proto,addr,port>.
513  *      should be called with locked tables.
514  */
515 static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
516 {
517         unsigned int hash;
518
519         if (!list_empty(&dest->d_list)) {
520                 return 0;
521         }
522
523         /*
524          *      Hash by proto,addr,port,
525          *      which are the parameters of the real service.
526          */
527         hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
528
529         list_add(&dest->d_list, &ipvs->rs_table[hash]);
530
531         return 1;
532 }
533
534 /*
535  *      UNhashes ip_vs_dest from rs_table.
536  *      should be called with locked tables.
537  */
538 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
539 {
540         /*
541          * Remove it from the rs_table table.
542          */
543         if (!list_empty(&dest->d_list)) {
544                 list_del_init(&dest->d_list);
545         }
546
547         return 1;
548 }
549
550 /*
551  *      Lookup real service by <proto,addr,port> in the real service table.
552  */
553 struct ip_vs_dest *
554 ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
555                           const union nf_inet_addr *daddr,
556                           __be16 dport)
557 {
558         struct netns_ipvs *ipvs = net_ipvs(net);
559         unsigned int hash;
560         struct ip_vs_dest *dest;
561
562         /*
563          *      Check for "full" addressed entries
564          *      Return the first found entry
565          */
566         hash = ip_vs_rs_hashkey(af, daddr, dport);
567
568         read_lock(&ipvs->rs_lock);
569         list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
570                 if ((dest->af == af)
571                     && ip_vs_addr_equal(af, &dest->addr, daddr)
572                     && (dest->port == dport)
573                     && ((dest->protocol == protocol) ||
574                         dest->vfwmark)) {
575                         /* HIT */
576                         read_unlock(&ipvs->rs_lock);
577                         return dest;
578                 }
579         }
580         read_unlock(&ipvs->rs_lock);
581
582         return NULL;
583 }
584
585 /*
586  *      Lookup destination by {addr,port} in the given service
587  */
588 static struct ip_vs_dest *
589 ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
590                   __be16 dport)
591 {
592         struct ip_vs_dest *dest;
593
594         /*
595          * Find the destination for the given service
596          */
597         list_for_each_entry(dest, &svc->destinations, n_list) {
598                 if ((dest->af == svc->af)
599                     && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
600                     && (dest->port == dport)) {
601                         /* HIT */
602                         return dest;
603                 }
604         }
605
606         return NULL;
607 }
608
609 /*
610  * Find destination by {daddr,dport,vaddr,protocol}
611  * Cretaed to be used in ip_vs_process_message() in
612  * the backup synchronization daemon. It finds the
613  * destination to be bound to the received connection
614  * on the backup.
615  *
616  * ip_vs_lookup_real_service() looked promissing, but
617  * seems not working as expected.
618  */
619 struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
620                                    const union nf_inet_addr *daddr,
621                                    __be16 dport,
622                                    const union nf_inet_addr *vaddr,
623                                    __be16 vport, __u16 protocol, __u32 fwmark,
624                                    __u32 flags)
625 {
626         struct ip_vs_dest *dest;
627         struct ip_vs_service *svc;
628         __be16 port = dport;
629
630         svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
631         if (!svc)
632                 return NULL;
633         if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
634                 port = 0;
635         dest = ip_vs_lookup_dest(svc, daddr, port);
636         if (!dest)
637                 dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
638         if (dest)
639                 atomic_inc(&dest->refcnt);
640         ip_vs_service_put(svc);
641         return dest;
642 }
643
644 void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
645 {
646         struct ip_vs_dest_dst *dest_dst = container_of(head,
647                                                        struct ip_vs_dest_dst,
648                                                        rcu_head);
649
650         dst_release(dest_dst->dst_cache);
651         kfree(dest_dst);
652 }
653
654 /* Release dest_dst and dst_cache for dest in user context */
655 static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
656 {
657         struct ip_vs_dest_dst *old;
658
659         old = rcu_dereference_protected(dest->dest_dst, 1);
660         if (old) {
661                 RCU_INIT_POINTER(dest->dest_dst, NULL);
662                 call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
663         }
664 }
665
666 /*
667  *  Lookup dest by {svc,addr,port} in the destination trash.
668  *  The destination trash is used to hold the destinations that are removed
669  *  from the service table but are still referenced by some conn entries.
670  *  The reason to add the destination trash is when the dest is temporary
671  *  down (either by administrator or by monitor program), the dest can be
672  *  picked back from the trash, the remaining connections to the dest can
673  *  continue, and the counting information of the dest is also useful for
674  *  scheduling.
675  */
676 static struct ip_vs_dest *
677 ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
678                      __be16 dport)
679 {
680         struct ip_vs_dest *dest, *nxt;
681         struct netns_ipvs *ipvs = net_ipvs(svc->net);
682
683         /*
684          * Find the destination in trash
685          */
686         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
687                 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
688                               "dest->refcnt=%d\n",
689                               dest->vfwmark,
690                               IP_VS_DBG_ADDR(svc->af, &dest->addr),
691                               ntohs(dest->port),
692                               atomic_read(&dest->refcnt));
693                 if (dest->af == svc->af &&
694                     ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
695                     dest->port == dport &&
696                     dest->vfwmark == svc->fwmark &&
697                     dest->protocol == svc->protocol &&
698                     (svc->fwmark ||
699                      (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
700                       dest->vport == svc->port))) {
701                         /* HIT */
702                         return dest;
703                 }
704
705                 /*
706                  * Try to purge the destination from trash if not referenced
707                  */
708                 if (atomic_read(&dest->refcnt) == 1) {
709                         IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
710                                       "from trash\n",
711                                       dest->vfwmark,
712                                       IP_VS_DBG_ADDR(svc->af, &dest->addr),
713                                       ntohs(dest->port));
714                         list_del(&dest->n_list);
715                         __ip_vs_dst_cache_reset(dest);
716                         __ip_vs_unbind_svc(dest);
717                         free_percpu(dest->stats.cpustats);
718                         kfree(dest);
719                 }
720         }
721
722         return NULL;
723 }
724
725
726 /*
727  *  Clean up all the destinations in the trash
728  *  Called by the ip_vs_control_cleanup()
729  *
730  *  When the ip_vs_control_clearup is activated by ipvs module exit,
731  *  the service tables must have been flushed and all the connections
732  *  are expired, and the refcnt of each destination in the trash must
733  *  be 1, so we simply release them here.
734  */
735 static void ip_vs_trash_cleanup(struct net *net)
736 {
737         struct ip_vs_dest *dest, *nxt;
738         struct netns_ipvs *ipvs = net_ipvs(net);
739
740         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
741                 list_del(&dest->n_list);
742                 __ip_vs_dst_cache_reset(dest);
743                 __ip_vs_unbind_svc(dest);
744                 free_percpu(dest->stats.cpustats);
745                 kfree(dest);
746         }
747 }
748
749 static void
750 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
751 {
752 #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c
753
754         spin_lock_bh(&src->lock);
755
756         IP_VS_SHOW_STATS_COUNTER(conns);
757         IP_VS_SHOW_STATS_COUNTER(inpkts);
758         IP_VS_SHOW_STATS_COUNTER(outpkts);
759         IP_VS_SHOW_STATS_COUNTER(inbytes);
760         IP_VS_SHOW_STATS_COUNTER(outbytes);
761
762         ip_vs_read_estimator(dst, src);
763
764         spin_unlock_bh(&src->lock);
765 }
766
767 static void
768 ip_vs_zero_stats(struct ip_vs_stats *stats)
769 {
770         spin_lock_bh(&stats->lock);
771
772         /* get current counters as zero point, rates are zeroed */
773
774 #define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c
775
776         IP_VS_ZERO_STATS_COUNTER(conns);
777         IP_VS_ZERO_STATS_COUNTER(inpkts);
778         IP_VS_ZERO_STATS_COUNTER(outpkts);
779         IP_VS_ZERO_STATS_COUNTER(inbytes);
780         IP_VS_ZERO_STATS_COUNTER(outbytes);
781
782         ip_vs_zero_estimator(stats);
783
784         spin_unlock_bh(&stats->lock);
785 }
786
787 /*
788  *      Update a destination in the given service
789  */
790 static void
791 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
792                     struct ip_vs_dest_user_kern *udest, int add)
793 {
794         struct netns_ipvs *ipvs = net_ipvs(svc->net);
795         int conn_flags;
796
797         /* set the weight and the flags */
798         atomic_set(&dest->weight, udest->weight);
799         conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
800         conn_flags |= IP_VS_CONN_F_INACTIVE;
801
802         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
803         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
804                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
805         } else {
806                 /*
807                  *    Put the real service in rs_table if not present.
808                  *    For now only for NAT!
809                  */
810                 write_lock_bh(&ipvs->rs_lock);
811                 ip_vs_rs_hash(ipvs, dest);
812                 write_unlock_bh(&ipvs->rs_lock);
813         }
814         atomic_set(&dest->conn_flags, conn_flags);
815
816         /* bind the service */
817         if (!dest->svc) {
818                 __ip_vs_bind_svc(dest, svc);
819         } else {
820                 if (dest->svc != svc) {
821                         __ip_vs_unbind_svc(dest);
822                         ip_vs_zero_stats(&dest->stats);
823                         __ip_vs_bind_svc(dest, svc);
824                 }
825         }
826
827         /* set the dest status flags */
828         dest->flags |= IP_VS_DEST_F_AVAILABLE;
829
830         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
831                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
832         dest->u_threshold = udest->u_threshold;
833         dest->l_threshold = udest->l_threshold;
834
835         spin_lock_bh(&dest->dst_lock);
836         __ip_vs_dst_cache_reset(dest);
837         spin_unlock_bh(&dest->dst_lock);
838
839         if (add)
840                 ip_vs_start_estimator(svc->net, &dest->stats);
841
842         write_lock_bh(&__ip_vs_svc_lock);
843
844         /* Wait until all other svc users go away */
845         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
846
847         if (add) {
848                 list_add(&dest->n_list, &svc->destinations);
849                 svc->num_dests++;
850         }
851
852         /* call the update_service, because server weight may be changed */
853         if (svc->scheduler->update_service)
854                 svc->scheduler->update_service(svc);
855
856         write_unlock_bh(&__ip_vs_svc_lock);
857 }
858
859
860 /*
861  *      Create a destination for the given service
862  */
863 static int
864 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
865                struct ip_vs_dest **dest_p)
866 {
867         struct ip_vs_dest *dest;
868         unsigned int atype;
869
870         EnterFunction(2);
871
872 #ifdef CONFIG_IP_VS_IPV6
873         if (svc->af == AF_INET6) {
874                 atype = ipv6_addr_type(&udest->addr.in6);
875                 if ((!(atype & IPV6_ADDR_UNICAST) ||
876                         atype & IPV6_ADDR_LINKLOCAL) &&
877                         !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
878                         return -EINVAL;
879         } else
880 #endif
881         {
882                 atype = inet_addr_type(svc->net, udest->addr.ip);
883                 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
884                         return -EINVAL;
885         }
886
887         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
888         if (dest == NULL)
889                 return -ENOMEM;
890
891         dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
892         if (!dest->stats.cpustats)
893                 goto err_alloc;
894
895         dest->af = svc->af;
896         dest->protocol = svc->protocol;
897         dest->vaddr = svc->addr;
898         dest->vport = svc->port;
899         dest->vfwmark = svc->fwmark;
900         ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
901         dest->port = udest->port;
902
903         atomic_set(&dest->activeconns, 0);
904         atomic_set(&dest->inactconns, 0);
905         atomic_set(&dest->persistconns, 0);
906         atomic_set(&dest->refcnt, 1);
907
908         INIT_LIST_HEAD(&dest->d_list);
909         spin_lock_init(&dest->dst_lock);
910         spin_lock_init(&dest->stats.lock);
911         __ip_vs_update_dest(svc, dest, udest, 1);
912
913         *dest_p = dest;
914
915         LeaveFunction(2);
916         return 0;
917
918 err_alloc:
919         kfree(dest);
920         return -ENOMEM;
921 }
922
923
924 /*
925  *      Add a destination into an existing service
926  */
927 static int
928 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
929 {
930         struct ip_vs_dest *dest;
931         union nf_inet_addr daddr;
932         __be16 dport = udest->port;
933         int ret;
934
935         EnterFunction(2);
936
937         if (udest->weight < 0) {
938                 pr_err("%s(): server weight less than zero\n", __func__);
939                 return -ERANGE;
940         }
941
942         if (udest->l_threshold > udest->u_threshold) {
943                 pr_err("%s(): lower threshold is higher than upper threshold\n",
944                         __func__);
945                 return -ERANGE;
946         }
947
948         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
949
950         /*
951          * Check if the dest already exists in the list
952          */
953         dest = ip_vs_lookup_dest(svc, &daddr, dport);
954
955         if (dest != NULL) {
956                 IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
957                 return -EEXIST;
958         }
959
960         /*
961          * Check if the dest already exists in the trash and
962          * is from the same service
963          */
964         dest = ip_vs_trash_get_dest(svc, &daddr, dport);
965
966         if (dest != NULL) {
967                 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
968                               "dest->refcnt=%d, service %u/%s:%u\n",
969                               IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
970                               atomic_read(&dest->refcnt),
971                               dest->vfwmark,
972                               IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
973                               ntohs(dest->vport));
974
975                 /*
976                  * Get the destination from the trash
977                  */
978                 list_del(&dest->n_list);
979
980                 __ip_vs_update_dest(svc, dest, udest, 1);
981                 ret = 0;
982         } else {
983                 /*
984                  * Allocate and initialize the dest structure
985                  */
986                 ret = ip_vs_new_dest(svc, udest, &dest);
987         }
988         LeaveFunction(2);
989
990         return ret;
991 }
992
993
994 /*
995  *      Edit a destination in the given service
996  */
997 static int
998 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
999 {
1000         struct ip_vs_dest *dest;
1001         union nf_inet_addr daddr;
1002         __be16 dport = udest->port;
1003
1004         EnterFunction(2);
1005
1006         if (udest->weight < 0) {
1007                 pr_err("%s(): server weight less than zero\n", __func__);
1008                 return -ERANGE;
1009         }
1010
1011         if (udest->l_threshold > udest->u_threshold) {
1012                 pr_err("%s(): lower threshold is higher than upper threshold\n",
1013                         __func__);
1014                 return -ERANGE;
1015         }
1016
1017         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
1018
1019         /*
1020          *  Lookup the destination list
1021          */
1022         dest = ip_vs_lookup_dest(svc, &daddr, dport);
1023
1024         if (dest == NULL) {
1025                 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
1026                 return -ENOENT;
1027         }
1028
1029         __ip_vs_update_dest(svc, dest, udest, 0);
1030         LeaveFunction(2);
1031
1032         return 0;
1033 }
1034
1035
1036 /*
1037  *      Delete a destination (must be already unlinked from the service)
1038  */
1039 static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
1040 {
1041         struct netns_ipvs *ipvs = net_ipvs(net);
1042
1043         ip_vs_stop_estimator(net, &dest->stats);
1044
1045         /*
1046          *  Remove it from the d-linked list with the real services.
1047          */
1048         write_lock_bh(&ipvs->rs_lock);
1049         ip_vs_rs_unhash(dest);
1050         write_unlock_bh(&ipvs->rs_lock);
1051
1052         /*
1053          *  Decrease the refcnt of the dest, and free the dest
1054          *  if nobody refers to it (refcnt=0). Otherwise, throw
1055          *  the destination into the trash.
1056          */
1057         if (atomic_dec_and_test(&dest->refcnt)) {
1058                 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
1059                               dest->vfwmark,
1060                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1061                               ntohs(dest->port));
1062                 __ip_vs_dst_cache_reset(dest);
1063                 /* simply decrease svc->refcnt here, let the caller check
1064                    and release the service if nobody refers to it.
1065                    Only user context can release destination and service,
1066                    and only one user context can update virtual service at a
1067                    time, so the operation here is OK */
1068                 atomic_dec(&dest->svc->refcnt);
1069                 free_percpu(dest->stats.cpustats);
1070                 kfree(dest);
1071         } else {
1072                 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
1073                               "dest->refcnt=%d\n",
1074                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1075                               ntohs(dest->port),
1076                               atomic_read(&dest->refcnt));
1077                 list_add(&dest->n_list, &ipvs->dest_trash);
1078                 atomic_inc(&dest->refcnt);
1079         }
1080 }
1081
1082
1083 /*
1084  *      Unlink a destination from the given service
1085  */
1086 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1087                                 struct ip_vs_dest *dest,
1088                                 int svcupd)
1089 {
1090         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1091
1092         /*
1093          *  Remove it from the d-linked destination list.
1094          */
1095         list_del(&dest->n_list);
1096         svc->num_dests--;
1097
1098         /*
1099          *  Call the update_service function of its scheduler
1100          */
1101         if (svcupd && svc->scheduler->update_service)
1102                         svc->scheduler->update_service(svc);
1103 }
1104
1105
1106 /*
1107  *      Delete a destination server in the given service
1108  */
1109 static int
1110 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1111 {
1112         struct ip_vs_dest *dest;
1113         __be16 dport = udest->port;
1114
1115         EnterFunction(2);
1116
1117         dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
1118
1119         if (dest == NULL) {
1120                 IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1121                 return -ENOENT;
1122         }
1123
1124         write_lock_bh(&__ip_vs_svc_lock);
1125
1126         /*
1127          *      Wait until all other svc users go away.
1128          */
1129         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1130
1131         /*
1132          *      Unlink dest from the service
1133          */
1134         __ip_vs_unlink_dest(svc, dest, 1);
1135
1136         write_unlock_bh(&__ip_vs_svc_lock);
1137
1138         /*
1139          *      Delete the destination
1140          */
1141         __ip_vs_del_dest(svc->net, dest);
1142
1143         LeaveFunction(2);
1144
1145         return 0;
1146 }
1147
1148
1149 /*
1150  *      Add a service into the service hash table
1151  */
1152 static int
1153 ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
1154                   struct ip_vs_service **svc_p)
1155 {
1156         int ret = 0;
1157         struct ip_vs_scheduler *sched = NULL;
1158         struct ip_vs_pe *pe = NULL;
1159         struct ip_vs_service *svc = NULL;
1160         struct netns_ipvs *ipvs = net_ipvs(net);
1161
1162         /* increase the module use count */
1163         ip_vs_use_count_inc();
1164
1165         /* Lookup the scheduler by 'u->sched_name' */
1166         sched = ip_vs_scheduler_get(u->sched_name);
1167         if (sched == NULL) {
1168                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1169                 ret = -ENOENT;
1170                 goto out_err;
1171         }
1172
1173         if (u->pe_name && *u->pe_name) {
1174                 pe = ip_vs_pe_getbyname(u->pe_name);
1175                 if (pe == NULL) {
1176                         pr_info("persistence engine module ip_vs_pe_%s "
1177                                 "not found\n", u->pe_name);
1178                         ret = -ENOENT;
1179                         goto out_err;
1180                 }
1181         }
1182
1183 #ifdef CONFIG_IP_VS_IPV6
1184         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1185                 ret = -EINVAL;
1186                 goto out_err;
1187         }
1188 #endif
1189
1190         svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
1191         if (svc == NULL) {
1192                 IP_VS_DBG(1, "%s(): no memory\n", __func__);
1193                 ret = -ENOMEM;
1194                 goto out_err;
1195         }
1196         svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1197         if (!svc->stats.cpustats) {
1198                 ret = -ENOMEM;
1199                 goto out_err;
1200         }
1201
1202         /* I'm the first user of the service */
1203         atomic_set(&svc->usecnt, 0);
1204         atomic_set(&svc->refcnt, 0);
1205
1206         svc->af = u->af;
1207         svc->protocol = u->protocol;
1208         ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1209         svc->port = u->port;
1210         svc->fwmark = u->fwmark;
1211         svc->flags = u->flags;
1212         svc->timeout = u->timeout * HZ;
1213         svc->netmask = u->netmask;
1214         svc->net = net;
1215
1216         INIT_LIST_HEAD(&svc->destinations);
1217         rwlock_init(&svc->sched_lock);
1218         spin_lock_init(&svc->stats.lock);
1219
1220         /* Bind the scheduler */
1221         ret = ip_vs_bind_scheduler(svc, sched);
1222         if (ret)
1223                 goto out_err;
1224         sched = NULL;
1225
1226         /* Bind the ct retriever */
1227         ip_vs_bind_pe(svc, pe);
1228         pe = NULL;
1229
1230         /* Update the virtual service counters */
1231         if (svc->port == FTPPORT)
1232                 atomic_inc(&ipvs->ftpsvc_counter);
1233         else if (svc->port == 0)
1234                 atomic_inc(&ipvs->nullsvc_counter);
1235
1236         ip_vs_start_estimator(net, &svc->stats);
1237
1238         /* Count only IPv4 services for old get/setsockopt interface */
1239         if (svc->af == AF_INET)
1240                 ipvs->num_services++;
1241
1242         /* Hash the service into the service table */
1243         write_lock_bh(&__ip_vs_svc_lock);
1244         ip_vs_svc_hash(svc);
1245         write_unlock_bh(&__ip_vs_svc_lock);
1246
1247         *svc_p = svc;
1248         /* Now there is a service - full throttle */
1249         ipvs->enable = 1;
1250         return 0;
1251
1252
1253  out_err:
1254         if (svc != NULL) {
1255                 ip_vs_unbind_scheduler(svc);
1256                 if (svc->inc) {
1257                         local_bh_disable();
1258                         ip_vs_app_inc_put(svc->inc);
1259                         local_bh_enable();
1260                 }
1261                 if (svc->stats.cpustats)
1262                         free_percpu(svc->stats.cpustats);
1263                 kfree(svc);
1264         }
1265         ip_vs_scheduler_put(sched);
1266         ip_vs_pe_put(pe);
1267
1268         /* decrease the module use count */
1269         ip_vs_use_count_dec();
1270
1271         return ret;
1272 }
1273
1274
1275 /*
1276  *      Edit a service and bind it with a new scheduler
1277  */
1278 static int
1279 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1280 {
1281         struct ip_vs_scheduler *sched, *old_sched;
1282         struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1283         int ret = 0;
1284
1285         /*
1286          * Lookup the scheduler, by 'u->sched_name'
1287          */
1288         sched = ip_vs_scheduler_get(u->sched_name);
1289         if (sched == NULL) {
1290                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1291                 return -ENOENT;
1292         }
1293         old_sched = sched;
1294
1295         if (u->pe_name && *u->pe_name) {
1296                 pe = ip_vs_pe_getbyname(u->pe_name);
1297                 if (pe == NULL) {
1298                         pr_info("persistence engine module ip_vs_pe_%s "
1299                                 "not found\n", u->pe_name);
1300                         ret = -ENOENT;
1301                         goto out;
1302                 }
1303                 old_pe = pe;
1304         }
1305
1306 #ifdef CONFIG_IP_VS_IPV6
1307         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1308                 ret = -EINVAL;
1309                 goto out;
1310         }
1311 #endif
1312
1313         write_lock_bh(&__ip_vs_svc_lock);
1314
1315         /*
1316          * Wait until all other svc users go away.
1317          */
1318         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1319
1320         /*
1321          * Set the flags and timeout value
1322          */
1323         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1324         svc->timeout = u->timeout * HZ;
1325         svc->netmask = u->netmask;
1326
1327         old_sched = svc->scheduler;
1328         if (sched != old_sched) {
1329                 /*
1330                  * Unbind the old scheduler
1331                  */
1332                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1333                         old_sched = sched;
1334                         goto out_unlock;
1335                 }
1336
1337                 /*
1338                  * Bind the new scheduler
1339                  */
1340                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1341                         /*
1342                          * If ip_vs_bind_scheduler fails, restore the old
1343                          * scheduler.
1344                          * The main reason of failure is out of memory.
1345                          *
1346                          * The question is if the old scheduler can be
1347                          * restored all the time. TODO: if it cannot be
1348                          * restored some time, we must delete the service,
1349                          * otherwise the system may crash.
1350                          */
1351                         ip_vs_bind_scheduler(svc, old_sched);
1352                         old_sched = sched;
1353                         goto out_unlock;
1354                 }
1355         }
1356
1357         old_pe = svc->pe;
1358         if (pe != old_pe) {
1359                 ip_vs_unbind_pe(svc);
1360                 ip_vs_bind_pe(svc, pe);
1361         }
1362
1363 out_unlock:
1364         write_unlock_bh(&__ip_vs_svc_lock);
1365 out:
1366         ip_vs_scheduler_put(old_sched);
1367         ip_vs_pe_put(old_pe);
1368         return ret;
1369 }
1370
1371
1372 /*
1373  *      Delete a service from the service list
1374  *      - The service must be unlinked, unlocked and not referenced!
1375  *      - We are called under _bh lock
1376  */
1377 static void __ip_vs_del_service(struct ip_vs_service *svc)
1378 {
1379         struct ip_vs_dest *dest, *nxt;
1380         struct ip_vs_scheduler *old_sched;
1381         struct ip_vs_pe *old_pe;
1382         struct netns_ipvs *ipvs = net_ipvs(svc->net);
1383
1384         pr_info("%s: enter\n", __func__);
1385
1386         /* Count only IPv4 services for old get/setsockopt interface */
1387         if (svc->af == AF_INET)
1388                 ipvs->num_services--;
1389
1390         ip_vs_stop_estimator(svc->net, &svc->stats);
1391
1392         /* Unbind scheduler */
1393         old_sched = svc->scheduler;
1394         ip_vs_unbind_scheduler(svc);
1395         ip_vs_scheduler_put(old_sched);
1396
1397         /* Unbind persistence engine */
1398         old_pe = svc->pe;
1399         ip_vs_unbind_pe(svc);
1400         ip_vs_pe_put(old_pe);
1401
1402         /* Unbind app inc */
1403         if (svc->inc) {
1404                 ip_vs_app_inc_put(svc->inc);
1405                 svc->inc = NULL;
1406         }
1407
1408         /*
1409          *    Unlink the whole destination list
1410          */
1411         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1412                 __ip_vs_unlink_dest(svc, dest, 0);
1413                 __ip_vs_del_dest(svc->net, dest);
1414         }
1415
1416         /*
1417          *    Update the virtual service counters
1418          */
1419         if (svc->port == FTPPORT)
1420                 atomic_dec(&ipvs->ftpsvc_counter);
1421         else if (svc->port == 0)
1422                 atomic_dec(&ipvs->nullsvc_counter);
1423
1424         /*
1425          *    Free the service if nobody refers to it
1426          */
1427         if (atomic_read(&svc->refcnt) == 0) {
1428                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
1429                               svc->fwmark,
1430                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
1431                               ntohs(svc->port), atomic_read(&svc->usecnt));
1432                 free_percpu(svc->stats.cpustats);
1433                 kfree(svc);
1434         }
1435
1436         /* decrease the module use count */
1437         ip_vs_use_count_dec();
1438 }
1439
1440 /*
1441  * Unlink a service from list and try to delete it if its refcnt reached 0
1442  */
1443 static void ip_vs_unlink_service(struct ip_vs_service *svc)
1444 {
1445         /*
1446          * Unhash it from the service table
1447          */
1448         write_lock_bh(&__ip_vs_svc_lock);
1449
1450         ip_vs_svc_unhash(svc);
1451
1452         /*
1453          * Wait until all the svc users go away.
1454          */
1455         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1456
1457         __ip_vs_del_service(svc);
1458
1459         write_unlock_bh(&__ip_vs_svc_lock);
1460 }
1461
1462 /*
1463  *      Delete a service from the service list
1464  */
1465 static int ip_vs_del_service(struct ip_vs_service *svc)
1466 {
1467         if (svc == NULL)
1468                 return -EEXIST;
1469         ip_vs_unlink_service(svc);
1470
1471         return 0;
1472 }
1473
1474
1475 /*
1476  *      Flush all the virtual services
1477  */
1478 static int ip_vs_flush(struct net *net)
1479 {
1480         int idx;
1481         struct ip_vs_service *svc, *nxt;
1482
1483         /*
1484          * Flush the service table hashed by <netns,protocol,addr,port>
1485          */
1486         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1487                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
1488                                          s_list) {
1489                         if (net_eq(svc->net, net))
1490                                 ip_vs_unlink_service(svc);
1491                 }
1492         }
1493
1494         /*
1495          * Flush the service table hashed by fwmark
1496          */
1497         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1498                 list_for_each_entry_safe(svc, nxt,
1499                                          &ip_vs_svc_fwm_table[idx], f_list) {
1500                         if (net_eq(svc->net, net))
1501                                 ip_vs_unlink_service(svc);
1502                 }
1503         }
1504
1505         return 0;
1506 }
1507
1508 /*
1509  *      Delete service by {netns} in the service table.
1510  *      Called by __ip_vs_cleanup()
1511  */
1512 void ip_vs_service_net_cleanup(struct net *net)
1513 {
1514         EnterFunction(2);
1515         /* Check for "full" addressed entries */
1516         mutex_lock(&__ip_vs_mutex);
1517         ip_vs_flush(net);
1518         mutex_unlock(&__ip_vs_mutex);
1519         LeaveFunction(2);
1520 }
1521
1522 /* Put all references for device (dst_cache) */
1523 static inline void
1524 ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
1525 {
1526         spin_lock_bh(&dest->dst_lock);
1527         if (dest->dest_dst && dest->dest_dst->dst_cache->dev == dev) {
1528                 IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
1529                               dev->name,
1530                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1531                               ntohs(dest->port),
1532                               atomic_read(&dest->refcnt));
1533                 __ip_vs_dst_cache_reset(dest);
1534         }
1535         spin_unlock_bh(&dest->dst_lock);
1536
1537 }
1538 /* Netdev event receiver
1539  * Currently only NETDEV_DOWN is handled to release refs to cached dsts
1540  */
1541 static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
1542                             void *ptr)
1543 {
1544         struct net_device *dev = ptr;
1545         struct net *net = dev_net(dev);
1546         struct netns_ipvs *ipvs = net_ipvs(net);
1547         struct ip_vs_service *svc;
1548         struct ip_vs_dest *dest;
1549         unsigned int idx;
1550
1551         if (event != NETDEV_DOWN || !ipvs)
1552                 return NOTIFY_DONE;
1553         IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
1554         EnterFunction(2);
1555         mutex_lock(&__ip_vs_mutex);
1556         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1557                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1558                         if (net_eq(svc->net, net)) {
1559                                 list_for_each_entry(dest, &svc->destinations,
1560                                                     n_list) {
1561                                         ip_vs_forget_dev(dest, dev);
1562                                 }
1563                         }
1564                 }
1565
1566                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1567                         if (net_eq(svc->net, net)) {
1568                                 list_for_each_entry(dest, &svc->destinations,
1569                                                     n_list) {
1570                                         ip_vs_forget_dev(dest, dev);
1571                                 }
1572                         }
1573
1574                 }
1575         }
1576
1577         list_for_each_entry(dest, &ipvs->dest_trash, n_list) {
1578                 ip_vs_forget_dev(dest, dev);
1579         }
1580         mutex_unlock(&__ip_vs_mutex);
1581         LeaveFunction(2);
1582         return NOTIFY_DONE;
1583 }
1584
1585 /*
1586  *      Zero counters in a service or all services
1587  */
1588 static int ip_vs_zero_service(struct ip_vs_service *svc)
1589 {
1590         struct ip_vs_dest *dest;
1591
1592         write_lock_bh(&__ip_vs_svc_lock);
1593         list_for_each_entry(dest, &svc->destinations, n_list) {
1594                 ip_vs_zero_stats(&dest->stats);
1595         }
1596         ip_vs_zero_stats(&svc->stats);
1597         write_unlock_bh(&__ip_vs_svc_lock);
1598         return 0;
1599 }
1600
1601 static int ip_vs_zero_all(struct net *net)
1602 {
1603         int idx;
1604         struct ip_vs_service *svc;
1605
1606         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1607                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1608                         if (net_eq(svc->net, net))
1609                                 ip_vs_zero_service(svc);
1610                 }
1611         }
1612
1613         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1614                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1615                         if (net_eq(svc->net, net))
1616                                 ip_vs_zero_service(svc);
1617                 }
1618         }
1619
1620         ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
1621         return 0;
1622 }
1623
1624 #ifdef CONFIG_SYSCTL
1625
1626 static int zero;
1627 static int three = 3;
1628
1629 static int
1630 proc_do_defense_mode(ctl_table *table, int write,
1631                      void __user *buffer, size_t *lenp, loff_t *ppos)
1632 {
1633         struct net *net = current->nsproxy->net_ns;
1634         int *valp = table->data;
1635         int val = *valp;
1636         int rc;
1637
1638         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1639         if (write && (*valp != val)) {
1640                 if ((*valp < 0) || (*valp > 3)) {
1641                         /* Restore the correct value */
1642                         *valp = val;
1643                 } else {
1644                         update_defense_level(net_ipvs(net));
1645                 }
1646         }
1647         return rc;
1648 }
1649
1650 static int
1651 proc_do_sync_threshold(ctl_table *table, int write,
1652                        void __user *buffer, size_t *lenp, loff_t *ppos)
1653 {
1654         int *valp = table->data;
1655         int val[2];
1656         int rc;
1657
1658         /* backup the value first */
1659         memcpy(val, valp, sizeof(val));
1660
1661         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1662         if (write && (valp[0] < 0 || valp[1] < 0 ||
1663             (valp[0] >= valp[1] && valp[1]))) {
1664                 /* Restore the correct value */
1665                 memcpy(valp, val, sizeof(val));
1666         }
1667         return rc;
1668 }
1669
1670 static int
1671 proc_do_sync_mode(ctl_table *table, int write,
1672                      void __user *buffer, size_t *lenp, loff_t *ppos)
1673 {
1674         int *valp = table->data;
1675         int val = *valp;
1676         int rc;
1677
1678         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1679         if (write && (*valp != val)) {
1680                 if ((*valp < 0) || (*valp > 1)) {
1681                         /* Restore the correct value */
1682                         *valp = val;
1683                 }
1684         }
1685         return rc;
1686 }
1687
1688 static int
1689 proc_do_sync_ports(ctl_table *table, int write,
1690                    void __user *buffer, size_t *lenp, loff_t *ppos)
1691 {
1692         int *valp = table->data;
1693         int val = *valp;
1694         int rc;
1695
1696         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1697         if (write && (*valp != val)) {
1698                 if (*valp < 1 || !is_power_of_2(*valp)) {
1699                         /* Restore the correct value */
1700                         *valp = val;
1701                 }
1702         }
1703         return rc;
1704 }
1705
1706 /*
1707  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1708  *      Do not change order or insert new entries without
1709  *      align with netns init in ip_vs_control_net_init()
1710  */
1711
1712 static struct ctl_table vs_vars[] = {
1713         {
1714                 .procname       = "amemthresh",
1715                 .maxlen         = sizeof(int),
1716                 .mode           = 0644,
1717                 .proc_handler   = proc_dointvec,
1718         },
1719         {
1720                 .procname       = "am_droprate",
1721                 .maxlen         = sizeof(int),
1722                 .mode           = 0644,
1723                 .proc_handler   = proc_dointvec,
1724         },
1725         {
1726                 .procname       = "drop_entry",
1727                 .maxlen         = sizeof(int),
1728                 .mode           = 0644,
1729                 .proc_handler   = proc_do_defense_mode,
1730         },
1731         {
1732                 .procname       = "drop_packet",
1733                 .maxlen         = sizeof(int),
1734                 .mode           = 0644,
1735                 .proc_handler   = proc_do_defense_mode,
1736         },
1737 #ifdef CONFIG_IP_VS_NFCT
1738         {
1739                 .procname       = "conntrack",
1740                 .maxlen         = sizeof(int),
1741                 .mode           = 0644,
1742                 .proc_handler   = &proc_dointvec,
1743         },
1744 #endif
1745         {
1746                 .procname       = "secure_tcp",
1747                 .maxlen         = sizeof(int),
1748                 .mode           = 0644,
1749                 .proc_handler   = proc_do_defense_mode,
1750         },
1751         {
1752                 .procname       = "snat_reroute",
1753                 .maxlen         = sizeof(int),
1754                 .mode           = 0644,
1755                 .proc_handler   = &proc_dointvec,
1756         },
1757         {
1758                 .procname       = "sync_version",
1759                 .maxlen         = sizeof(int),
1760                 .mode           = 0644,
1761                 .proc_handler   = &proc_do_sync_mode,
1762         },
1763         {
1764                 .procname       = "sync_ports",
1765                 .maxlen         = sizeof(int),
1766                 .mode           = 0644,
1767                 .proc_handler   = &proc_do_sync_ports,
1768         },
1769         {
1770                 .procname       = "sync_qlen_max",
1771                 .maxlen         = sizeof(int),
1772                 .mode           = 0644,
1773                 .proc_handler   = proc_dointvec,
1774         },
1775         {
1776                 .procname       = "sync_sock_size",
1777                 .maxlen         = sizeof(int),
1778                 .mode           = 0644,
1779                 .proc_handler   = proc_dointvec,
1780         },
1781         {
1782                 .procname       = "cache_bypass",
1783                 .maxlen         = sizeof(int),
1784                 .mode           = 0644,
1785                 .proc_handler   = proc_dointvec,
1786         },
1787         {
1788                 .procname       = "expire_nodest_conn",
1789                 .maxlen         = sizeof(int),
1790                 .mode           = 0644,
1791                 .proc_handler   = proc_dointvec,
1792         },
1793         {
1794                 .procname       = "expire_quiescent_template",
1795                 .maxlen         = sizeof(int),
1796                 .mode           = 0644,
1797                 .proc_handler   = proc_dointvec,
1798         },
1799         {
1800                 .procname       = "sync_threshold",
1801                 .maxlen         =
1802                         sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
1803                 .mode           = 0644,
1804                 .proc_handler   = proc_do_sync_threshold,
1805         },
1806         {
1807                 .procname       = "sync_refresh_period",
1808                 .maxlen         = sizeof(int),
1809                 .mode           = 0644,
1810                 .proc_handler   = proc_dointvec_jiffies,
1811         },
1812         {
1813                 .procname       = "sync_retries",
1814                 .maxlen         = sizeof(int),
1815                 .mode           = 0644,
1816                 .proc_handler   = proc_dointvec_minmax,
1817                 .extra1         = &zero,
1818                 .extra2         = &three,
1819         },
1820         {
1821                 .procname       = "nat_icmp_send",
1822                 .maxlen         = sizeof(int),
1823                 .mode           = 0644,
1824                 .proc_handler   = proc_dointvec,
1825         },
1826         {
1827                 .procname       = "pmtu_disc",
1828                 .maxlen         = sizeof(int),
1829                 .mode           = 0644,
1830                 .proc_handler   = proc_dointvec,
1831         },
1832         {
1833                 .procname       = "backup_only",
1834                 .maxlen         = sizeof(int),
1835                 .mode           = 0644,
1836                 .proc_handler   = proc_dointvec,
1837         },
1838 #ifdef CONFIG_IP_VS_DEBUG
1839         {
1840                 .procname       = "debug_level",
1841                 .data           = &sysctl_ip_vs_debug_level,
1842                 .maxlen         = sizeof(int),
1843                 .mode           = 0644,
1844                 .proc_handler   = proc_dointvec,
1845         },
1846 #endif
1847 #if 0
1848         {
1849                 .procname       = "timeout_established",
1850                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1851                 .maxlen         = sizeof(int),
1852                 .mode           = 0644,
1853                 .proc_handler   = proc_dointvec_jiffies,
1854         },
1855         {
1856                 .procname       = "timeout_synsent",
1857                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1858                 .maxlen         = sizeof(int),
1859                 .mode           = 0644,
1860                 .proc_handler   = proc_dointvec_jiffies,
1861         },
1862         {
1863                 .procname       = "timeout_synrecv",
1864                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1865                 .maxlen         = sizeof(int),
1866                 .mode           = 0644,
1867                 .proc_handler   = proc_dointvec_jiffies,
1868         },
1869         {
1870                 .procname       = "timeout_finwait",
1871                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1872                 .maxlen         = sizeof(int),
1873                 .mode           = 0644,
1874                 .proc_handler   = proc_dointvec_jiffies,
1875         },
1876         {
1877                 .procname       = "timeout_timewait",
1878                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1879                 .maxlen         = sizeof(int),
1880                 .mode           = 0644,
1881                 .proc_handler   = proc_dointvec_jiffies,
1882         },
1883         {
1884                 .procname       = "timeout_close",
1885                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1886                 .maxlen         = sizeof(int),
1887                 .mode           = 0644,
1888                 .proc_handler   = proc_dointvec_jiffies,
1889         },
1890         {
1891                 .procname       = "timeout_closewait",
1892                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1893                 .maxlen         = sizeof(int),
1894                 .mode           = 0644,
1895                 .proc_handler   = proc_dointvec_jiffies,
1896         },
1897         {
1898                 .procname       = "timeout_lastack",
1899                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1900                 .maxlen         = sizeof(int),
1901                 .mode           = 0644,
1902                 .proc_handler   = proc_dointvec_jiffies,
1903         },
1904         {
1905                 .procname       = "timeout_listen",
1906                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1907                 .maxlen         = sizeof(int),
1908                 .mode           = 0644,
1909                 .proc_handler   = proc_dointvec_jiffies,
1910         },
1911         {
1912                 .procname       = "timeout_synack",
1913                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1914                 .maxlen         = sizeof(int),
1915                 .mode           = 0644,
1916                 .proc_handler   = proc_dointvec_jiffies,
1917         },
1918         {
1919                 .procname       = "timeout_udp",
1920                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1921                 .maxlen         = sizeof(int),
1922                 .mode           = 0644,
1923                 .proc_handler   = proc_dointvec_jiffies,
1924         },
1925         {
1926                 .procname       = "timeout_icmp",
1927                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1928                 .maxlen         = sizeof(int),
1929                 .mode           = 0644,
1930                 .proc_handler   = proc_dointvec_jiffies,
1931         },
1932 #endif
1933         { }
1934 };
1935
1936 #endif
1937
1938 #ifdef CONFIG_PROC_FS
1939
/*
 * Private iterator state for the service listing seq_file; allocated by
 * seq_open_net() in ip_vs_info_open() and reached through seq->private.
 */
struct ip_vs_iter {
	struct seq_net_private p;  /* Do not move this, netns depends upon it*/
	struct list_head *table;   /* hash table holding the current service:
				    * ip_vs_svc_table or ip_vs_svc_fwm_table */
	int bucket;                /* index of the current bucket in ->table */
};
1945
1946 /*
1947  *      Write the contents of the VS rule table to a PROCfs file.
1948  *      (It is kept just for backward compatibility)
1949  */
1950 static inline const char *ip_vs_fwd_name(unsigned int flags)
1951 {
1952         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1953         case IP_VS_CONN_F_LOCALNODE:
1954                 return "Local";
1955         case IP_VS_CONN_F_TUNNEL:
1956                 return "Tunnel";
1957         case IP_VS_CONN_F_DROUTE:
1958                 return "Route";
1959         default:
1960                 return "Masq";
1961         }
1962 }
1963
1964
1965 /* Get the Nth entry in the two lists */
1966 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1967 {
1968         struct net *net = seq_file_net(seq);
1969         struct ip_vs_iter *iter = seq->private;
1970         int idx;
1971         struct ip_vs_service *svc;
1972
1973         /* look in hash by protocol */
1974         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1975                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1976                         if (net_eq(svc->net, net) && pos-- == 0) {
1977                                 iter->table = ip_vs_svc_table;
1978                                 iter->bucket = idx;
1979                                 return svc;
1980                         }
1981                 }
1982         }
1983
1984         /* keep looking in fwmark */
1985         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1986                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1987                         if (net_eq(svc->net, net) && pos-- == 0) {
1988                                 iter->table = ip_vs_svc_fwm_table;
1989                                 iter->bucket = idx;
1990                                 return svc;
1991                         }
1992                 }
1993         }
1994
1995         return NULL;
1996 }
1997
1998 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1999 __acquires(__ip_vs_svc_lock)
2000 {
2001
2002         read_lock_bh(&__ip_vs_svc_lock);
2003         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
2004 }
2005
2006
2007 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2008 {
2009         struct list_head *e;
2010         struct ip_vs_iter *iter;
2011         struct ip_vs_service *svc;
2012
2013         ++*pos;
2014         if (v == SEQ_START_TOKEN)
2015                 return ip_vs_info_array(seq,0);
2016
2017         svc = v;
2018         iter = seq->private;
2019
2020         if (iter->table == ip_vs_svc_table) {
2021                 /* next service in table hashed by protocol */
2022                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
2023                         return list_entry(e, struct ip_vs_service, s_list);
2024
2025
2026                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
2027                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
2028                                             s_list) {
2029                                 return svc;
2030                         }
2031                 }
2032
2033                 iter->table = ip_vs_svc_fwm_table;
2034                 iter->bucket = -1;
2035                 goto scan_fwmark;
2036         }
2037
2038         /* next service in hashed by fwmark */
2039         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
2040                 return list_entry(e, struct ip_vs_service, f_list);
2041
2042  scan_fwmark:
2043         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
2044                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
2045                                     f_list)
2046                         return svc;
2047         }
2048
2049         return NULL;
2050 }
2051
/*
 * seq_file ->stop: drop the service table read lock that
 * ip_vs_info_seq_start() acquired. Neither argument is used.
 */
static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
__releases(__ip_vs_svc_lock)
{
	read_unlock_bh(&__ip_vs_svc_lock);
}
2057
2058
2059 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
2060 {
2061         if (v == SEQ_START_TOKEN) {
2062                 seq_printf(seq,
2063                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
2064                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2065                 seq_puts(seq,
2066                          "Prot LocalAddress:Port Scheduler Flags\n");
2067                 seq_puts(seq,
2068                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
2069         } else {
2070                 const struct ip_vs_service *svc = v;
2071                 const struct ip_vs_iter *iter = seq->private;
2072                 const struct ip_vs_dest *dest;
2073
2074                 if (iter->table == ip_vs_svc_table) {
2075 #ifdef CONFIG_IP_VS_IPV6
2076                         if (svc->af == AF_INET6)
2077                                 seq_printf(seq, "%s  [%pI6]:%04X %s ",
2078                                            ip_vs_proto_name(svc->protocol),
2079                                            &svc->addr.in6,
2080                                            ntohs(svc->port),
2081                                            svc->scheduler->name);
2082                         else
2083 #endif
2084                                 seq_printf(seq, "%s  %08X:%04X %s %s ",
2085                                            ip_vs_proto_name(svc->protocol),
2086                                            ntohl(svc->addr.ip),
2087                                            ntohs(svc->port),
2088                                            svc->scheduler->name,
2089                                            (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2090                 } else {
2091                         seq_printf(seq, "FWM  %08X %s %s",
2092                                    svc->fwmark, svc->scheduler->name,
2093                                    (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2094                 }
2095
2096                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
2097                         seq_printf(seq, "persistent %d %08X\n",
2098                                 svc->timeout,
2099                                 ntohl(svc->netmask));
2100                 else
2101                         seq_putc(seq, '\n');
2102
2103                 list_for_each_entry(dest, &svc->destinations, n_list) {
2104 #ifdef CONFIG_IP_VS_IPV6
2105                         if (dest->af == AF_INET6)
2106                                 seq_printf(seq,
2107                                            "  -> [%pI6]:%04X"
2108                                            "      %-7s %-6d %-10d %-10d\n",
2109                                            &dest->addr.in6,
2110                                            ntohs(dest->port),
2111                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2112                                            atomic_read(&dest->weight),
2113                                            atomic_read(&dest->activeconns),
2114                                            atomic_read(&dest->inactconns));
2115                         else
2116 #endif
2117                                 seq_printf(seq,
2118                                            "  -> %08X:%04X      "
2119                                            "%-7s %-6d %-10d %-10d\n",
2120                                            ntohl(dest->addr.ip),
2121                                            ntohs(dest->port),
2122                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2123                                            atomic_read(&dest->weight),
2124                                            atomic_read(&dest->activeconns),
2125                                            atomic_read(&dest->inactconns));
2126
2127                 }
2128         }
2129         return 0;
2130 }
2131
2132 static const struct seq_operations ip_vs_info_seq_ops = {
2133         .start = ip_vs_info_seq_start,
2134         .next  = ip_vs_info_seq_next,
2135         .stop  = ip_vs_info_seq_stop,
2136         .show  = ip_vs_info_seq_show,
2137 };
2138
2139 static int ip_vs_info_open(struct inode *inode, struct file *file)
2140 {
2141         return seq_open_net(inode, file, &ip_vs_info_seq_ops,
2142                         sizeof(struct ip_vs_iter));
2143 }
2144
2145 static const struct file_operations ip_vs_info_fops = {
2146         .owner   = THIS_MODULE,
2147         .open    = ip_vs_info_open,
2148         .read    = seq_read,
2149         .llseek  = seq_lseek,
2150         .release = seq_release_net,
2151 };
2152
2153 static int ip_vs_stats_show(struct seq_file *seq, void *v)
2154 {
2155         struct net *net = seq_file_single_net(seq);
2156         struct ip_vs_stats_user show;
2157
2158 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2159         seq_puts(seq,
2160                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
2161         seq_printf(seq,
2162                    "   Conns  Packets  Packets            Bytes            Bytes\n");
2163
2164         ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
2165         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns,
2166                    show.inpkts, show.outpkts,
2167                    (unsigned long long) show.inbytes,
2168                    (unsigned long long) show.outbytes);
2169
2170 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2171         seq_puts(seq,
2172                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2173         seq_printf(seq, "%8X %8X %8X %16X %16X\n",
2174                         show.cps, show.inpps, show.outpps,
2175                         show.inbps, show.outbps);
2176
2177         return 0;
2178 }
2179
2180 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
2181 {
2182         return single_open_net(inode, file, ip_vs_stats_show);
2183 }
2184
2185 static const struct file_operations ip_vs_stats_fops = {
2186         .owner = THIS_MODULE,
2187         .open = ip_vs_stats_seq_open,
2188         .read = seq_read,
2189         .llseek = seq_lseek,
2190         .release = single_release_net,
2191 };
2192
2193 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
2194 {
2195         struct net *net = seq_file_single_net(seq);
2196         struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
2197         struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats;
2198         struct ip_vs_stats_user rates;
2199         int i;
2200
2201 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2202         seq_puts(seq,
2203                  "       Total Incoming Outgoing         Incoming         Outgoing\n");
2204         seq_printf(seq,
2205                    "CPU    Conns  Packets  Packets            Bytes            Bytes\n");
2206
2207         for_each_possible_cpu(i) {
2208                 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
2209                 unsigned int start;
2210                 __u64 inbytes, outbytes;
2211
2212                 do {
2213                         start = u64_stats_fetch_begin_bh(&u->syncp);
2214                         inbytes = u->ustats.inbytes;
2215                         outbytes = u->ustats.outbytes;
2216                 } while (u64_stats_fetch_retry_bh(&u->syncp, start));
2217
2218                 seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
2219                            i, u->ustats.conns, u->ustats.inpkts,
2220                            u->ustats.outpkts, (__u64)inbytes,
2221                            (__u64)outbytes);
2222         }
2223
2224         spin_lock_bh(&tot_stats->lock);
2225
2226         seq_printf(seq, "  ~ %8X %8X %8X %16LX %16LX\n\n",
2227                    tot_stats->ustats.conns, tot_stats->ustats.inpkts,
2228                    tot_stats->ustats.outpkts,
2229                    (unsigned long long) tot_stats->ustats.inbytes,
2230                    (unsigned long long) tot_stats->ustats.outbytes);
2231
2232         ip_vs_read_estimator(&rates, tot_stats);
2233
2234         spin_unlock_bh(&tot_stats->lock);
2235
2236 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2237         seq_puts(seq,
2238                    "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2239         seq_printf(seq, "    %8X %8X %8X %16X %16X\n",
2240                         rates.cps,
2241                         rates.inpps,
2242                         rates.outpps,
2243                         rates.inbps,
2244                         rates.outbps);
2245
2246         return 0;
2247 }
2248
2249 static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
2250 {
2251         return single_open_net(inode, file, ip_vs_stats_percpu_show);
2252 }
2253
2254 static const struct file_operations ip_vs_stats_percpu_fops = {
2255         .owner = THIS_MODULE,
2256         .open = ip_vs_stats_percpu_seq_open,
2257         .read = seq_read,
2258         .llseek = seq_lseek,
2259         .release = single_release_net,
2260 };
2261 #endif
2262
2263 /*
2264  *      Set timeout values for tcp tcpfin udp in the timeout_table.
2265  */
2266 static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
2267 {
2268 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2269         struct ip_vs_proto_data *pd;
2270 #endif
2271
2272         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
2273                   u->tcp_timeout,
2274                   u->tcp_fin_timeout,
2275                   u->udp_timeout);
2276
2277 #ifdef CONFIG_IP_VS_PROTO_TCP
2278         if (u->tcp_timeout) {
2279                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2280                 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
2281                         = u->tcp_timeout * HZ;
2282         }
2283
2284         if (u->tcp_fin_timeout) {
2285                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2286                 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
2287                         = u->tcp_fin_timeout * HZ;
2288         }
2289 #endif
2290
2291 #ifdef CONFIG_IP_VS_PROTO_UDP
2292         if (u->udp_timeout) {
2293                 pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2294                 pd->timeout_table[IP_VS_UDP_S_NORMAL]
2295                         = u->udp_timeout * HZ;
2296         }
2297 #endif
2298         return 0;
2299 }
2300
2301
/*
 * SET_CMDID() turns a set-sockopt command number into an index relative to
 * IP_VS_BASE_CTL. The argument and the whole expansion are parenthesized so
 * that the macro is safe for any expression and in any context.
 *
 * The *_ARG_LEN macros are the payload sizes the set commands accept;
 * MAX_ARG_LEN is the largest of them (service plus destination).
 */
#define SET_CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
#define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
#define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
				 sizeof(struct ip_vs_dest_user))
#define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
#define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
#define MAX_ARG_LEN		SVCDEST_ARG_LEN
2309
2310 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
2311         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
2312         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
2313         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
2314         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
2315         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
2316         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
2317         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
2318         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
2319         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
2320         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
2321         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
2322 };
2323
2324 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2325                                   struct ip_vs_service_user *usvc_compat)
2326 {
2327         memset(usvc, 0, sizeof(*usvc));
2328
2329         usvc->af                = AF_INET;
2330         usvc->protocol          = usvc_compat->protocol;
2331         usvc->addr.ip           = usvc_compat->addr;
2332         usvc->port              = usvc_compat->port;
2333         usvc->fwmark            = usvc_compat->fwmark;
2334
2335         /* Deep copy of sched_name is not needed here */
2336         usvc->sched_name        = usvc_compat->sched_name;
2337
2338         usvc->flags             = usvc_compat->flags;
2339         usvc->timeout           = usvc_compat->timeout;
2340         usvc->netmask           = usvc_compat->netmask;
2341 }
2342
2343 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2344                                    struct ip_vs_dest_user *udest_compat)
2345 {
2346         memset(udest, 0, sizeof(*udest));
2347
2348         udest->addr.ip          = udest_compat->addr;
2349         udest->port             = udest_compat->port;
2350         udest->conn_flags       = udest_compat->conn_flags;
2351         udest->weight           = udest_compat->weight;
2352         udest->u_threshold      = udest_compat->u_threshold;
2353         udest->l_threshold      = udest_compat->l_threshold;
2354 }
2355
2356 static int
2357 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2358 {
2359         struct net *net = sock_net(sk);
2360         int ret;
2361         unsigned char arg[MAX_ARG_LEN];
2362         struct ip_vs_service_user *usvc_compat;
2363         struct ip_vs_service_user_kern usvc;
2364         struct ip_vs_service *svc;
2365         struct ip_vs_dest_user *udest_compat;
2366         struct ip_vs_dest_user_kern udest;
2367         struct netns_ipvs *ipvs = net_ipvs(net);
2368
2369         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2370                 return -EPERM;
2371
2372         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
2373                 return -EINVAL;
2374         if (len < 0 || len >  MAX_ARG_LEN)
2375                 return -EINVAL;
2376         if (len != set_arglen[SET_CMDID(cmd)]) {
2377                 pr_err("set_ctl: len %u != %u\n",
2378                        len, set_arglen[SET_CMDID(cmd)]);
2379                 return -EINVAL;
2380         }
2381
2382         if (copy_from_user(arg, user, len) != 0)
2383                 return -EFAULT;
2384
2385         /* increase the module use count */
2386         ip_vs_use_count_inc();
2387
2388         /* Handle daemons since they have another lock */
2389         if (cmd == IP_VS_SO_SET_STARTDAEMON ||
2390             cmd == IP_VS_SO_SET_STOPDAEMON) {
2391                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2392
2393                 if (mutex_lock_interruptible(&ipvs->sync_mutex)) {
2394                         ret = -ERESTARTSYS;
2395                         goto out_dec;
2396                 }
2397                 if (cmd == IP_VS_SO_SET_STARTDAEMON)
2398                         ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
2399                                                 dm->syncid);
2400                 else
2401                         ret = stop_sync_thread(net, dm->state);
2402                 mutex_unlock(&ipvs->sync_mutex);
2403                 goto out_dec;
2404         }
2405
2406         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
2407                 ret = -ERESTARTSYS;
2408                 goto out_dec;
2409         }
2410
2411         if (cmd == IP_VS_SO_SET_FLUSH) {
2412                 /* Flush the virtual service */
2413                 ret = ip_vs_flush(net);
2414                 goto out_unlock;
2415         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2416                 /* Set timeout values for (tcp tcpfin udp) */
2417                 ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
2418                 goto out_unlock;
2419         }
2420
2421         usvc_compat = (struct ip_vs_service_user *)arg;
2422         udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2423
2424         /* We only use the new structs internally, so copy userspace compat
2425          * structs to extended internal versions */
2426         ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2427         ip_vs_copy_udest_compat(&udest, udest_compat);
2428
2429         if (cmd == IP_VS_SO_SET_ZERO) {
2430                 /* if no service address is set, zero counters in all */
2431                 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2432                         ret = ip_vs_zero_all(net);
2433                         goto out_unlock;
2434                 }
2435         }
2436
2437         /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
2438         if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
2439             usvc.protocol != IPPROTO_SCTP) {
2440                 pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n",
2441                        usvc.protocol, &usvc.addr.ip,
2442                        ntohs(usvc.port), usvc.sched_name);
2443                 ret = -EFAULT;
2444                 goto out_unlock;
2445         }
2446
2447         /* Lookup the exact service by <protocol, addr, port> or fwmark */
2448         if (usvc.fwmark == 0)
2449                 svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
2450                                            &usvc.addr, usvc.port);
2451         else
2452                 svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
2453
2454         if (cmd != IP_VS_SO_SET_ADD
2455             && (svc == NULL || svc->protocol != usvc.protocol)) {
2456                 ret = -ESRCH;
2457                 goto out_unlock;
2458         }
2459
2460         switch (cmd) {
2461         case IP_VS_SO_SET_ADD:
2462                 if (svc != NULL)
2463                         ret = -EEXIST;
2464                 else
2465                         ret = ip_vs_add_service(net, &usvc, &svc);
2466                 break;
2467         case IP_VS_SO_SET_EDIT:
2468                 ret = ip_vs_edit_service(svc, &usvc);
2469                 break;
2470         case IP_VS_SO_SET_DEL:
2471                 ret = ip_vs_del_service(svc);
2472                 if (!ret)
2473                         goto out_unlock;
2474                 break;
2475         case IP_VS_SO_SET_ZERO:
2476                 ret = ip_vs_zero_service(svc);
2477                 break;
2478         case IP_VS_SO_SET_ADDDEST:
2479                 ret = ip_vs_add_dest(svc, &udest);
2480                 break;
2481         case IP_VS_SO_SET_EDITDEST:
2482                 ret = ip_vs_edit_dest(svc, &udest);
2483                 break;
2484         case IP_VS_SO_SET_DELDEST:
2485                 ret = ip_vs_del_dest(svc, &udest);
2486                 break;
2487         default:
2488                 ret = -EINVAL;
2489         }
2490
2491   out_unlock:
2492         mutex_unlock(&__ip_vs_mutex);
2493   out_dec:
2494         /* decrease the module use count */
2495         ip_vs_use_count_dec();
2496
2497         return ret;
2498 }
2499
2500
2501 static void
2502 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2503 {
2504         dst->protocol = src->protocol;
2505         dst->addr = src->addr.ip;
2506         dst->port = src->port;
2507         dst->fwmark = src->fwmark;
2508         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2509         dst->flags = src->flags;
2510         dst->timeout = src->timeout / HZ;
2511         dst->netmask = src->netmask;
2512         dst->num_dests = src->num_dests;
2513         ip_vs_copy_stats(&dst->stats, &src->stats);
2514 }
2515
2516 static inline int
2517 __ip_vs_get_service_entries(struct net *net,
2518                             const struct ip_vs_get_services *get,
2519                             struct ip_vs_get_services __user *uptr)
2520 {
2521         int idx, count=0;
2522         struct ip_vs_service *svc;
2523         struct ip_vs_service_entry entry;
2524         int ret = 0;
2525
2526         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2527                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2528                         /* Only expose IPv4 entries to old interface */
2529                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2530                                 continue;
2531
2532                         if (count >= get->num_services)
2533                                 goto out;
2534                         memset(&entry, 0, sizeof(entry));
2535                         ip_vs_copy_service(&entry, svc);
2536                         if (copy_to_user(&uptr->entrytable[count],
2537                                          &entry, sizeof(entry))) {
2538                                 ret = -EFAULT;
2539                                 goto out;
2540                         }
2541                         count++;
2542                 }
2543         }
2544
2545         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2546                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2547                         /* Only expose IPv4 entries to old interface */
2548                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2549                                 continue;
2550
2551                         if (count >= get->num_services)
2552                                 goto out;
2553                         memset(&entry, 0, sizeof(entry));
2554                         ip_vs_copy_service(&entry, svc);
2555                         if (copy_to_user(&uptr->entrytable[count],
2556                                          &entry, sizeof(entry))) {
2557                                 ret = -EFAULT;
2558                                 goto out;
2559                         }
2560                         count++;
2561                 }
2562         }
2563 out:
2564         return ret;
2565 }
2566
2567 static inline int
2568 __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
2569                          struct ip_vs_get_dests __user *uptr)
2570 {
2571         struct ip_vs_service *svc;
2572         union nf_inet_addr addr = { .ip = get->addr };
2573         int ret = 0;
2574
2575         if (get->fwmark)
2576                 svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
2577         else
2578                 svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
2579                                            get->port);
2580
2581         if (svc) {
2582                 int count = 0;
2583                 struct ip_vs_dest *dest;
2584                 struct ip_vs_dest_entry entry;
2585
2586                 list_for_each_entry(dest, &svc->destinations, n_list) {
2587                         if (count >= get->num_dests)
2588                                 break;
2589
2590                         entry.addr = dest->addr.ip;
2591                         entry.port = dest->port;
2592                         entry.conn_flags = atomic_read(&dest->conn_flags);
2593                         entry.weight = atomic_read(&dest->weight);
2594                         entry.u_threshold = dest->u_threshold;
2595                         entry.l_threshold = dest->l_threshold;
2596                         entry.activeconns = atomic_read(&dest->activeconns);
2597                         entry.inactconns = atomic_read(&dest->inactconns);
2598                         entry.persistconns = atomic_read(&dest->persistconns);
2599                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2600                         if (copy_to_user(&uptr->entrytable[count],
2601                                          &entry, sizeof(entry))) {
2602                                 ret = -EFAULT;
2603                                 break;
2604                         }
2605                         count++;
2606                 }
2607         } else
2608                 ret = -ESRCH;
2609         return ret;
2610 }
2611
2612 static inline void
2613 __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
2614 {
2615 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2616         struct ip_vs_proto_data *pd;
2617 #endif
2618
2619         memset(u, 0, sizeof (*u));
2620
2621 #ifdef CONFIG_IP_VS_PROTO_TCP
2622         pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2623         u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2624         u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2625 #endif
2626 #ifdef CONFIG_IP_VS_PROTO_UDP
2627         pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2628         u->udp_timeout =
2629                         pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2630 #endif
2631 }
2632
2633
2634 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2635 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2636 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2637 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2638 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2639 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2640 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2641
2642 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2643         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2644         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2645         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2646         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2647         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2648         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2649         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2650 };
2651
2652 static int
2653 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2654 {
2655         unsigned char arg[128];
2656         int ret = 0;
2657         unsigned int copylen;
2658         struct net *net = sock_net(sk);
2659         struct netns_ipvs *ipvs = net_ipvs(net);
2660
2661         BUG_ON(!net);
2662         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2663                 return -EPERM;
2664
2665         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
2666                 return -EINVAL;
2667
2668         if (*len < get_arglen[GET_CMDID(cmd)]) {
2669                 pr_err("get_ctl: len %u < %u\n",
2670                        *len, get_arglen[GET_CMDID(cmd)]);
2671                 return -EINVAL;
2672         }
2673
2674         copylen = get_arglen[GET_CMDID(cmd)];
2675         if (copylen > 128)
2676                 return -EINVAL;
2677
2678         if (copy_from_user(arg, user, copylen) != 0)
2679                 return -EFAULT;
2680         /*
2681          * Handle daemons first since it has its own locking
2682          */
2683         if (cmd == IP_VS_SO_GET_DAEMON) {
2684                 struct ip_vs_daemon_user d[2];
2685
2686                 memset(&d, 0, sizeof(d));
2687                 if (mutex_lock_interruptible(&ipvs->sync_mutex))
2688                         return -ERESTARTSYS;
2689
2690                 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
2691                         d[0].state = IP_VS_STATE_MASTER;
2692                         strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
2693                                 sizeof(d[0].mcast_ifn));
2694                         d[0].syncid = ipvs->master_syncid;
2695                 }
2696                 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
2697                         d[1].state = IP_VS_STATE_BACKUP;
2698                         strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
2699                                 sizeof(d[1].mcast_ifn));
2700                         d[1].syncid = ipvs->backup_syncid;
2701                 }
2702                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2703                         ret = -EFAULT;
2704                 mutex_unlock(&ipvs->sync_mutex);
2705                 return ret;
2706         }
2707
2708         if (mutex_lock_interruptible(&__ip_vs_mutex))
2709                 return -ERESTARTSYS;
2710
2711         switch (cmd) {
2712         case IP_VS_SO_GET_VERSION:
2713         {
2714                 char buf[64];
2715
2716                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2717                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2718                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2719                         ret = -EFAULT;
2720                         goto out;
2721                 }
2722                 *len = strlen(buf)+1;
2723         }
2724         break;
2725
2726         case IP_VS_SO_GET_INFO:
2727         {
2728                 struct ip_vs_getinfo info;
2729                 info.version = IP_VS_VERSION_CODE;
2730                 info.size = ip_vs_conn_tab_size;
2731                 info.num_services = ipvs->num_services;
2732                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2733                         ret = -EFAULT;
2734         }
2735         break;
2736
2737         case IP_VS_SO_GET_SERVICES:
2738         {
2739                 struct ip_vs_get_services *get;
2740                 int size;
2741
2742                 get = (struct ip_vs_get_services *)arg;
2743                 size = sizeof(*get) +
2744                         sizeof(struct ip_vs_service_entry) * get->num_services;
2745                 if (*len != size) {
2746                         pr_err("length: %u != %u\n", *len, size);
2747                         ret = -EINVAL;
2748                         goto out;
2749                 }
2750                 ret = __ip_vs_get_service_entries(net, get, user);
2751         }
2752         break;
2753
2754         case IP_VS_SO_GET_SERVICE:
2755         {
2756                 struct ip_vs_service_entry *entry;
2757                 struct ip_vs_service *svc;
2758                 union nf_inet_addr addr;
2759
2760                 entry = (struct ip_vs_service_entry *)arg;
2761                 addr.ip = entry->addr;
2762                 if (entry->fwmark)
2763                         svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
2764                 else
2765                         svc = __ip_vs_service_find(net, AF_INET,
2766                                                    entry->protocol, &addr,
2767                                                    entry->port);
2768                 if (svc) {
2769                         ip_vs_copy_service(entry, svc);
2770                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2771                                 ret = -EFAULT;
2772                 } else
2773                         ret = -ESRCH;
2774         }
2775         break;
2776
2777         case IP_VS_SO_GET_DESTS:
2778         {
2779                 struct ip_vs_get_dests *get;
2780                 int size;
2781
2782                 get = (struct ip_vs_get_dests *)arg;
2783                 size = sizeof(*get) +
2784                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2785                 if (*len != size) {
2786                         pr_err("length: %u != %u\n", *len, size);
2787                         ret = -EINVAL;
2788                         goto out;
2789                 }
2790                 ret = __ip_vs_get_dest_entries(net, get, user);
2791         }
2792         break;
2793
2794         case IP_VS_SO_GET_TIMEOUT:
2795         {
2796                 struct ip_vs_timeout_user t;
2797
2798                 __ip_vs_get_timeouts(net, &t);
2799                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2800                         ret = -EFAULT;
2801         }
2802         break;
2803
2804         default:
2805                 ret = -EINVAL;
2806         }
2807
2808 out:
2809         mutex_unlock(&__ip_vs_mutex);
2810         return ret;
2811 }
2812
2813
2814 static struct nf_sockopt_ops ip_vs_sockopts = {
2815         .pf             = PF_INET,
2816         .set_optmin     = IP_VS_BASE_CTL,
2817         .set_optmax     = IP_VS_SO_SET_MAX+1,
2818         .set            = do_ip_vs_set_ctl,
2819         .get_optmin     = IP_VS_BASE_CTL,
2820         .get_optmax     = IP_VS_SO_GET_MAX+1,
2821         .get            = do_ip_vs_get_ctl,
2822         .owner          = THIS_MODULE,
2823 };
2824
2825 /*
2826  * Generic Netlink interface
2827  */
2828
2829 /* IPVS genetlink family */
2830 static struct genl_family ip_vs_genl_family = {
2831         .id             = GENL_ID_GENERATE,
2832         .hdrsize        = 0,
2833         .name           = IPVS_GENL_NAME,
2834         .version        = IPVS_GENL_VERSION,
2835         .maxattr        = IPVS_CMD_MAX,
2836         .netnsok        = true,         /* Make ipvsadm to work on netns */
2837 };
2838
2839 /* Policy used for first-level command attributes */
2840 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
2841         [IPVS_CMD_ATTR_SERVICE]         = { .type = NLA_NESTED },
2842         [IPVS_CMD_ATTR_DEST]            = { .type = NLA_NESTED },
2843         [IPVS_CMD_ATTR_DAEMON]          = { .type = NLA_NESTED },
2844         [IPVS_CMD_ATTR_TIMEOUT_TCP]     = { .type = NLA_U32 },
2845         [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
2846         [IPVS_CMD_ATTR_TIMEOUT_UDP]     = { .type = NLA_U32 },
2847 };
2848
2849 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
2850 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
2851         [IPVS_DAEMON_ATTR_STATE]        = { .type = NLA_U32 },
2852         [IPVS_DAEMON_ATTR_MCAST_IFN]    = { .type = NLA_NUL_STRING,
2853                                             .len = IP_VS_IFNAME_MAXLEN },
2854         [IPVS_DAEMON_ATTR_SYNC_ID]      = { .type = NLA_U32 },
2855 };
2856
2857 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
2858 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2859         [IPVS_SVC_ATTR_AF]              = { .type = NLA_U16 },
2860         [IPVS_SVC_ATTR_PROTOCOL]        = { .type = NLA_U16 },
2861         [IPVS_SVC_ATTR_ADDR]            = { .type = NLA_BINARY,
2862                                             .len = sizeof(union nf_inet_addr) },
2863         [IPVS_SVC_ATTR_PORT]            = { .type = NLA_U16 },
2864         [IPVS_SVC_ATTR_FWMARK]          = { .type = NLA_U32 },
2865         [IPVS_SVC_ATTR_SCHED_NAME]      = { .type = NLA_NUL_STRING,
2866                                             .len = IP_VS_SCHEDNAME_MAXLEN },
2867         [IPVS_SVC_ATTR_PE_NAME]         = { .type = NLA_NUL_STRING,
2868                                             .len = IP_VS_PENAME_MAXLEN },
2869         [IPVS_SVC_ATTR_FLAGS]           = { .type = NLA_BINARY,
2870                                             .len = sizeof(struct ip_vs_flags) },
2871         [IPVS_SVC_ATTR_TIMEOUT]         = { .type = NLA_U32 },
2872         [IPVS_SVC_ATTR_NETMASK]         = { .type = NLA_U32 },
2873         [IPVS_SVC_ATTR_STATS]           = { .type = NLA_NESTED },
2874 };
2875
2876 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
2877 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
2878         [IPVS_DEST_ATTR_ADDR]           = { .type = NLA_BINARY,
2879                                             .len = sizeof(union nf_inet_addr) },
2880         [IPVS_DEST_ATTR_PORT]           = { .type = NLA_U16 },
2881         [IPVS_DEST_ATTR_FWD_METHOD]     = { .type = NLA_U32 },
2882         [IPVS_DEST_ATTR_WEIGHT]         = { .type = NLA_U32 },
2883         [IPVS_DEST_ATTR_U_THRESH]       = { .type = NLA_U32 },
2884         [IPVS_DEST_ATTR_L_THRESH]       = { .type = NLA_U32 },
2885         [IPVS_DEST_ATTR_ACTIVE_CONNS]   = { .type = NLA_U32 },
2886         [IPVS_DEST_ATTR_INACT_CONNS]    = { .type = NLA_U32 },
2887         [IPVS_DEST_ATTR_PERSIST_CONNS]  = { .type = NLA_U32 },
2888         [IPVS_DEST_ATTR_STATS]          = { .type = NLA_NESTED },
2889 };
2890
2891 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2892                                  struct ip_vs_stats *stats)
2893 {
2894         struct ip_vs_stats_user ustats;
2895         struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2896         if (!nl_stats)
2897                 return -EMSGSIZE;
2898
2899         ip_vs_copy_stats(&ustats, stats);
2900
2901         if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) ||
2902             nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) ||
2903             nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) ||
2904             nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) ||
2905             nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) ||
2906             nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) ||
2907             nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) ||
2908             nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) ||
2909             nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) ||
2910             nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps))
2911                 goto nla_put_failure;
2912         nla_nest_end(skb, nl_stats);
2913
2914         return 0;
2915
2916 nla_put_failure:
2917         nla_nest_cancel(skb, nl_stats);
2918         return -EMSGSIZE;
2919 }
2920
2921 static int ip_vs_genl_fill_service(struct sk_buff *skb,
2922                                    struct ip_vs_service *svc)
2923 {
2924         struct nlattr *nl_service;
2925         struct ip_vs_flags flags = { .flags = svc->flags,
2926                                      .mask = ~0 };
2927
2928         nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
2929         if (!nl_service)
2930                 return -EMSGSIZE;
2931
2932         if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
2933                 goto nla_put_failure;
2934         if (svc->fwmark) {
2935                 if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
2936                         goto nla_put_failure;
2937         } else {
2938                 if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
2939                     nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
2940                     nla_put_u16(skb, IPVS_SVC_ATTR_PORT, svc->port))
2941                         goto nla_put_failure;
2942         }
2943
2944         if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name) ||
2945             (svc->pe &&
2946              nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name)) ||
2947             nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
2948             nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
2949             nla_put_u32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
2950                 goto nla_put_failure;
2951         if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
2952                 goto nla_put_failure;
2953
2954         nla_nest_end(skb, nl_service);
2955
2956         return 0;
2957
2958 nla_put_failure:
2959         nla_nest_cancel(skb, nl_service);
2960         return -EMSGSIZE;
2961 }
2962
2963 static int ip_vs_genl_dump_service(struct sk_buff *skb,
2964                                    struct ip_vs_service *svc,
2965                                    struct netlink_callback *cb)
2966 {
2967         void *hdr;
2968
2969         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
2970                           &ip_vs_genl_family, NLM_F_MULTI,
2971                           IPVS_CMD_NEW_SERVICE);
2972         if (!hdr)
2973                 return -EMSGSIZE;
2974
2975         if (ip_vs_genl_fill_service(skb, svc) < 0)
2976                 goto nla_put_failure;
2977
2978         return genlmsg_end(skb, hdr);
2979
2980 nla_put_failure:
2981         genlmsg_cancel(skb, hdr);
2982         return -EMSGSIZE;
2983 }
2984
2985 static int ip_vs_genl_dump_services(struct sk_buff *skb,
2986                                     struct netlink_callback *cb)
2987 {
2988         int idx = 0, i;
2989         int start = cb->args[0];
2990         struct ip_vs_service *svc;
2991         struct net *net = skb_sknet(skb);
2992
2993         mutex_lock(&__ip_vs_mutex);
2994         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2995                 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
2996                         if (++idx <= start || !net_eq(svc->net, net))
2997                                 continue;
2998                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2999                                 idx--;
3000                                 goto nla_put_failure;
3001                         }
3002                 }
3003         }
3004
3005         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
3006                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
3007                         if (++idx <= start || !net_eq(svc->net, net))
3008                                 continue;
3009                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
3010                                 idx--;
3011                                 goto nla_put_failure;
3012                         }
3013                 }
3014         }
3015
3016 nla_put_failure:
3017         mutex_unlock(&__ip_vs_mutex);
3018         cb->args[0] = idx;
3019
3020         return skb->len;
3021 }
3022
3023 static int ip_vs_genl_parse_service(struct net *net,
3024                                     struct ip_vs_service_user_kern *usvc,
3025                                     struct nlattr *nla, int full_entry,
3026                                     struct ip_vs_service **ret_svc)
3027 {
3028         struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
3029         struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
3030         struct ip_vs_service *svc;
3031
3032         /* Parse mandatory identifying service fields first */
3033         if (nla == NULL ||
3034             nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
3035                 return -EINVAL;
3036
3037         nla_af          = attrs[IPVS_SVC_ATTR_AF];
3038         nla_protocol    = attrs[IPVS_SVC_ATTR_PROTOCOL];
3039         nla_addr        = attrs[IPVS_SVC_ATTR_ADDR];
3040         nla_port        = attrs[IPVS_SVC_ATTR_PORT];
3041         nla_fwmark      = attrs[IPVS_SVC_ATTR_FWMARK];
3042
3043         if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
3044                 return -EINVAL;
3045
3046         memset(usvc, 0, sizeof(*usvc));
3047
3048         usvc->af = nla_get_u16(nla_af);
3049 #ifdef CONFIG_IP_VS_IPV6
3050         if (usvc->af != AF_INET && usvc->af != AF_INET6)
3051 #else
3052         if (usvc->af != AF_INET)
3053 #endif
3054                 return -EAFNOSUPPORT;
3055
3056         if (nla_fwmark) {
3057                 usvc->protocol = IPPROTO_TCP;
3058                 usvc->fwmark = nla_get_u32(nla_fwmark);
3059         } else {
3060                 usvc->protocol = nla_get_u16(nla_protocol);
3061                 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
3062                 usvc->port = nla_get_u16(nla_port);
3063                 usvc->fwmark = 0;
3064         }
3065
3066         if (usvc->fwmark)
3067                 svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
3068         else
3069                 svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
3070                                            &usvc->addr, usvc->port);
3071         *ret_svc = svc;
3072
3073         /* If a full entry was requested, check for the additional fields */
3074         if (full_entry) {
3075                 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
3076                               *nla_netmask;
3077                 struct ip_vs_flags flags;
3078
3079                 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
3080                 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
3081                 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
3082                 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
3083                 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
3084
3085                 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
3086                         return -EINVAL;
3087
3088                 nla_memcpy(&flags, nla_flags, sizeof(flags));
3089
3090                 /* prefill flags from service if it already exists */
3091                 if (svc)
3092                         usvc->flags = svc->flags;
3093
3094                 /* set new flags from userland */
3095                 usvc->flags = (usvc->flags & ~flags.mask) |
3096                               (flags.flags & flags.mask);
3097                 usvc->sched_name = nla_data(nla_sched);
3098                 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
3099                 usvc->timeout = nla_get_u32(nla_timeout);
3100                 usvc->netmask = nla_get_u32(nla_netmask);
3101         }
3102
3103         return 0;
3104 }
3105
3106 static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
3107                                                      struct nlattr *nla)
3108 {
3109         struct ip_vs_service_user_kern usvc;
3110         struct ip_vs_service *svc;
3111         int ret;
3112
3113         ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
3114         return ret ? ERR_PTR(ret) : svc;
3115 }
3116
3117 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
3118 {
3119         struct nlattr *nl_dest;
3120
3121         nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
3122         if (!nl_dest)
3123                 return -EMSGSIZE;
3124
3125         if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
3126             nla_put_u16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
3127             nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
3128                         (atomic_read(&dest->conn_flags) &
3129                          IP_VS_CONN_F_FWD_MASK)) ||
3130             nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
3131                         atomic_read(&dest->weight)) ||
3132             nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
3133             nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
3134             nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
3135                         atomic_read(&dest->activeconns)) ||
3136             nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
3137                         atomic_read(&dest->inactconns)) ||
3138             nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
3139                         atomic_read(&dest->persistconns)))
3140                 goto nla_put_failure;
3141         if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
3142                 goto nla_put_failure;
3143
3144         nla_nest_end(skb, nl_dest);
3145
3146         return 0;
3147
3148 nla_put_failure:
3149         nla_nest_cancel(skb, nl_dest);
3150         return -EMSGSIZE;
3151 }
3152
3153 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
3154                                 struct netlink_callback *cb)
3155 {
3156         void *hdr;
3157
3158         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3159                           &ip_vs_genl_family, NLM_F_MULTI,
3160                           IPVS_CMD_NEW_DEST);
3161         if (!hdr)
3162                 return -EMSGSIZE;
3163
3164         if (ip_vs_genl_fill_dest(skb, dest) < 0)
3165                 goto nla_put_failure;
3166
3167         return genlmsg_end(skb, hdr);
3168
3169 nla_put_failure:
3170         genlmsg_cancel(skb, hdr);
3171         return -EMSGSIZE;
3172 }
3173
3174 static int ip_vs_genl_dump_dests(struct sk_buff *skb,
3175                                  struct netlink_callback *cb)
3176 {
3177         int idx = 0;
3178         int start = cb->args[0];
3179         struct ip_vs_service *svc;
3180         struct ip_vs_dest *dest;
3181         struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
3182         struct net *net = skb_sknet(skb);
3183
3184         mutex_lock(&__ip_vs_mutex);
3185
3186         /* Try to find the service for which to dump destinations */
3187         if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
3188                         IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
3189                 goto out_err;
3190
3191
3192         svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
3193         if (IS_ERR(svc) || svc == NULL)
3194                 goto out_err;
3195
3196         /* Dump the destinations */
3197         list_for_each_entry(dest, &svc->destinations, n_list) {
3198                 if (++idx <= start)
3199                         continue;
3200                 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
3201                         idx--;
3202                         goto nla_put_failure;
3203                 }
3204         }
3205
3206 nla_put_failure:
3207         cb->args[0] = idx;
3208
3209 out_err:
3210         mutex_unlock(&__ip_vs_mutex);
3211
3212         return skb->len;
3213 }
3214
3215 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
3216                                  struct nlattr *nla, int full_entry)
3217 {
3218         struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
3219         struct nlattr *nla_addr, *nla_port;
3220
3221         /* Parse mandatory identifying destination fields first */
3222         if (nla == NULL ||
3223             nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
3224                 return -EINVAL;
3225
3226         nla_addr        = attrs[IPVS_DEST_ATTR_ADDR];
3227         nla_port        = attrs[IPVS_DEST_ATTR_PORT];
3228
3229         if (!(nla_addr && nla_port))
3230                 return -EINVAL;
3231
3232         memset(udest, 0, sizeof(*udest));
3233
3234         nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
3235         udest->port = nla_get_u16(nla_port);
3236
3237         /* If a full entry was requested, check for the additional fields */
3238         if (full_entry) {
3239                 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
3240                               *nla_l_thresh;
3241
3242                 nla_fwd         = attrs[IPVS_DEST_ATTR_FWD_METHOD];
3243                 nla_weight      = attrs[IPVS_DEST_ATTR_WEIGHT];
3244                 nla_u_thresh    = attrs[IPVS_DEST_ATTR_U_THRESH];
3245                 nla_l_thresh    = attrs[IPVS_DEST_ATTR_L_THRESH];
3246
3247                 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
3248                         return -EINVAL;
3249
3250                 udest->conn_flags = nla_get_u32(nla_fwd)
3251                                     & IP_VS_CONN_F_FWD_MASK;
3252                 udest->weight = nla_get_u32(nla_weight);
3253                 udest->u_threshold = nla_get_u32(nla_u_thresh);
3254                 udest->l_threshold = nla_get_u32(nla_l_thresh);
3255         }
3256
3257         return 0;
3258 }
3259
3260 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
3261                                   const char *mcast_ifn, __be32 syncid)
3262 {
3263         struct nlattr *nl_daemon;
3264
3265         nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
3266         if (!nl_daemon)
3267                 return -EMSGSIZE;
3268
3269         if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
3270             nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) ||
3271             nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid))
3272                 goto nla_put_failure;
3273         nla_nest_end(skb, nl_daemon);
3274
3275         return 0;
3276
3277 nla_put_failure:
3278         nla_nest_cancel(skb, nl_daemon);
3279         return -EMSGSIZE;
3280 }
3281
3282 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
3283                                   const char *mcast_ifn, __be32 syncid,
3284                                   struct netlink_callback *cb)
3285 {
3286         void *hdr;
3287         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3288                           &ip_vs_genl_family, NLM_F_MULTI,
3289                           IPVS_CMD_NEW_DAEMON);
3290         if (!hdr)
3291                 return -EMSGSIZE;
3292
3293         if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
3294                 goto nla_put_failure;
3295
3296         return genlmsg_end(skb, hdr);
3297
3298 nla_put_failure:
3299         genlmsg_cancel(skb, hdr);
3300         return -EMSGSIZE;
3301 }
3302
3303 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
3304                                    struct netlink_callback *cb)
3305 {
3306         struct net *net = skb_sknet(skb);
3307         struct netns_ipvs *ipvs = net_ipvs(net);
3308
3309         mutex_lock(&ipvs->sync_mutex);
3310         if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
3311                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
3312                                            ipvs->master_mcast_ifn,
3313                                            ipvs->master_syncid, cb) < 0)
3314                         goto nla_put_failure;
3315
3316                 cb->args[0] = 1;
3317         }
3318
3319         if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
3320                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
3321                                            ipvs->backup_mcast_ifn,
3322                                            ipvs->backup_syncid, cb) < 0)
3323                         goto nla_put_failure;
3324
3325                 cb->args[1] = 1;
3326         }
3327
3328 nla_put_failure:
3329         mutex_unlock(&ipvs->sync_mutex);
3330
3331         return skb->len;
3332 }
3333
3334 static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
3335 {
3336         if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3337               attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3338               attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3339                 return -EINVAL;
3340
3341         return start_sync_thread(net,
3342                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3343                                  nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3344                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
3345 }
3346
3347 static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
3348 {
3349         if (!attrs[IPVS_DAEMON_ATTR_STATE])
3350                 return -EINVAL;
3351
3352         return stop_sync_thread(net,
3353                                 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3354 }
3355
3356 static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
3357 {
3358         struct ip_vs_timeout_user t;
3359
3360         __ip_vs_get_timeouts(net, &t);
3361
3362         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3363                 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
3364
3365         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
3366                 t.tcp_fin_timeout =
3367                         nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
3368
3369         if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3370                 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3371
3372         return ip_vs_set_timeout(net, &t);
3373 }
3374
3375 static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
3376 {
3377         int ret = 0, cmd;
3378         struct net *net;
3379         struct netns_ipvs *ipvs;
3380
3381         net = skb_sknet(skb);
3382         ipvs = net_ipvs(net);
3383         cmd = info->genlhdr->cmd;
3384
3385         if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
3386                 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
3387
3388                 mutex_lock(&ipvs->sync_mutex);
3389                 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
3390                     nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
3391                                      info->attrs[IPVS_CMD_ATTR_DAEMON],
3392                                      ip_vs_daemon_policy)) {
3393                         ret = -EINVAL;
3394                         goto out;
3395                 }
3396
3397                 if (cmd == IPVS_CMD_NEW_DAEMON)
3398                         ret = ip_vs_genl_new_daemon(net, daemon_attrs);
3399                 else
3400                         ret = ip_vs_genl_del_daemon(net, daemon_attrs);
3401 out:
3402                 mutex_unlock(&ipvs->sync_mutex);
3403         }
3404         return ret;
3405 }
3406
3407 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3408 {
3409         struct ip_vs_service *svc = NULL;
3410         struct ip_vs_service_user_kern usvc;
3411         struct ip_vs_dest_user_kern udest;
3412         int ret = 0, cmd;
3413         int need_full_svc = 0, need_full_dest = 0;
3414         struct net *net;
3415
3416         net = skb_sknet(skb);
3417         cmd = info->genlhdr->cmd;
3418
3419         mutex_lock(&__ip_vs_mutex);
3420
3421         if (cmd == IPVS_CMD_FLUSH) {
3422                 ret = ip_vs_flush(net);
3423                 goto out;
3424         } else if (cmd == IPVS_CMD_SET_CONFIG) {
3425                 ret = ip_vs_genl_set_config(net, info->attrs);
3426                 goto out;
3427         } else if (cmd == IPVS_CMD_ZERO &&
3428                    !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3429                 ret = ip_vs_zero_all(net);
3430                 goto out;
3431         }
3432
3433         /* All following commands require a service argument, so check if we
3434          * received a valid one. We need a full service specification when
3435          * adding / editing a service. Only identifying members otherwise. */
3436         if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
3437                 need_full_svc = 1;
3438
3439         ret = ip_vs_genl_parse_service(net, &usvc,
3440                                        info->attrs[IPVS_CMD_ATTR_SERVICE],
3441                                        need_full_svc, &svc);
3442         if (ret)
3443                 goto out;
3444
3445         /* Unless we're adding a new service, the service must already exist */
3446         if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
3447                 ret = -ESRCH;
3448                 goto out;
3449         }
3450
3451         /* Destination commands require a valid destination argument. For
3452          * adding / editing a destination, we need a full destination
3453          * specification. */
3454         if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
3455             cmd == IPVS_CMD_DEL_DEST) {
3456                 if (cmd != IPVS_CMD_DEL_DEST)
3457                         need_full_dest = 1;
3458
3459                 ret = ip_vs_genl_parse_dest(&udest,
3460                                             info->attrs[IPVS_CMD_ATTR_DEST],
3461                                             need_full_dest);
3462                 if (ret)
3463                         goto out;
3464         }
3465
3466         switch (cmd) {
3467         case IPVS_CMD_NEW_SERVICE:
3468                 if (svc == NULL)
3469                         ret = ip_vs_add_service(net, &usvc, &svc);
3470                 else
3471                         ret = -EEXIST;
3472                 break;
3473         case IPVS_CMD_SET_SERVICE:
3474                 ret = ip_vs_edit_service(svc, &usvc);
3475                 break;
3476         case IPVS_CMD_DEL_SERVICE:
3477                 ret = ip_vs_del_service(svc);
3478                 /* do not use svc, it can be freed */
3479                 break;
3480         case IPVS_CMD_NEW_DEST:
3481                 ret = ip_vs_add_dest(svc, &udest);
3482                 break;
3483         case IPVS_CMD_SET_DEST:
3484                 ret = ip_vs_edit_dest(svc, &udest);
3485                 break;
3486         case IPVS_CMD_DEL_DEST:
3487                 ret = ip_vs_del_dest(svc, &udest);
3488                 break;
3489         case IPVS_CMD_ZERO:
3490                 ret = ip_vs_zero_service(svc);
3491                 break;
3492         default:
3493                 ret = -EINVAL;
3494         }
3495
3496 out:
3497         mutex_unlock(&__ip_vs_mutex);
3498
3499         return ret;
3500 }
3501
3502 static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3503 {
3504         struct sk_buff *msg;
3505         void *reply;
3506         int ret, cmd, reply_cmd;
3507         struct net *net;
3508
3509         net = skb_sknet(skb);
3510         cmd = info->genlhdr->cmd;
3511
3512         if (cmd == IPVS_CMD_GET_SERVICE)
3513                 reply_cmd = IPVS_CMD_NEW_SERVICE;
3514         else if (cmd == IPVS_CMD_GET_INFO)
3515                 reply_cmd = IPVS_CMD_SET_INFO;
3516         else if (cmd == IPVS_CMD_GET_CONFIG)
3517                 reply_cmd = IPVS_CMD_SET_CONFIG;
3518         else {
3519                 pr_err("unknown Generic Netlink command\n");
3520                 return -EINVAL;
3521         }
3522
3523         msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3524         if (!msg)
3525                 return -ENOMEM;
3526
3527         mutex_lock(&__ip_vs_mutex);
3528
3529         reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
3530         if (reply == NULL)
3531                 goto nla_put_failure;
3532
3533         switch (cmd) {
3534         case IPVS_CMD_GET_SERVICE:
3535         {
3536                 struct ip_vs_service *svc;
3537
3538                 svc = ip_vs_genl_find_service(net,
3539                                               info->attrs[IPVS_CMD_ATTR_SERVICE]);
3540                 if (IS_ERR(svc)) {
3541                         ret = PTR_ERR(svc);
3542                         goto out_err;
3543                 } else if (svc) {
3544                         ret = ip_vs_genl_fill_service(msg, svc);
3545                         if (ret)
3546                                 goto nla_put_failure;
3547                 } else {
3548                         ret = -ESRCH;
3549                         goto out_err;
3550                 }
3551
3552                 break;
3553         }
3554
3555         case IPVS_CMD_GET_CONFIG:
3556         {
3557                 struct ip_vs_timeout_user t;
3558
3559                 __ip_vs_get_timeouts(net, &t);
3560 #ifdef CONFIG_IP_VS_PROTO_TCP
3561                 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
3562                                 t.tcp_timeout) ||
3563                     nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
3564                                 t.tcp_fin_timeout))
3565                         goto nla_put_failure;
3566 #endif
3567 #ifdef CONFIG_IP_VS_PROTO_UDP
3568                 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
3569                         goto nla_put_failure;
3570 #endif
3571
3572                 break;
3573         }
3574
3575         case IPVS_CMD_GET_INFO:
3576                 if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
3577                                 IP_VS_VERSION_CODE) ||
3578                     nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
3579                                 ip_vs_conn_tab_size))
3580                         goto nla_put_failure;
3581                 break;
3582         }
3583
3584         genlmsg_end(msg, reply);
3585         ret = genlmsg_reply(msg, info);
3586         goto out;
3587
3588 nla_put_failure:
3589         pr_err("not enough space in Netlink message\n");
3590         ret = -EMSGSIZE;
3591
3592 out_err:
3593         nlmsg_free(msg);
3594 out:
3595         mutex_unlock(&__ip_vs_mutex);
3596
3597         return ret;
3598 }
3599
3600
3601 static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
3602         {
3603                 .cmd    = IPVS_CMD_NEW_SERVICE,
3604                 .flags  = GENL_ADMIN_PERM,
3605                 .policy = ip_vs_cmd_policy,
3606                 .doit   = ip_vs_genl_set_cmd,
3607         },
3608         {
3609                 .cmd    = IPVS_CMD_SET_SERVICE,
3610                 .flags  = GENL_ADMIN_PERM,
3611                 .policy = ip_vs_cmd_policy,
3612                 .doit   = ip_vs_genl_set_cmd,
3613         },
3614         {
3615                 .cmd    = IPVS_CMD_DEL_SERVICE,
3616                 .flags  = GENL_ADMIN_PERM,
3617                 .policy = ip_vs_cmd_policy,
3618                 .doit   = ip_vs_genl_set_cmd,
3619         },
3620         {
3621                 .cmd    = IPVS_CMD_GET_SERVICE,
3622                 .flags  = GENL_ADMIN_PERM,
3623                 .doit   = ip_vs_genl_get_cmd,
3624                 .dumpit = ip_vs_genl_dump_services,
3625                 .policy = ip_vs_cmd_policy,
3626         },
3627         {
3628                 .cmd    = IPVS_CMD_NEW_DEST,
3629                 .flags  = GENL_ADMIN_PERM,
3630                 .policy = ip_vs_cmd_policy,
3631                 .doit   = ip_vs_genl_set_cmd,
3632         },
3633         {
3634                 .cmd    = IPVS_CMD_SET_DEST,
3635                 .flags  = GENL_ADMIN_PERM,
3636                 .policy = ip_vs_cmd_policy,
3637                 .doit   = ip_vs_genl_set_cmd,
3638         },
3639         {
3640                 .cmd    = IPVS_CMD_DEL_DEST,
3641                 .flags  = GENL_ADMIN_PERM,
3642                 .policy = ip_vs_cmd_policy,
3643                 .doit   = ip_vs_genl_set_cmd,
3644         },
3645         {
3646                 .cmd    = IPVS_CMD_GET_DEST,
3647                 .flags  = GENL_ADMIN_PERM,
3648                 .policy = ip_vs_cmd_policy,
3649                 .dumpit = ip_vs_genl_dump_dests,
3650         },
3651         {
3652                 .cmd    = IPVS_CMD_NEW_DAEMON,
3653                 .flags  = GENL_ADMIN_PERM,
3654                 .policy = ip_vs_cmd_policy,
3655                 .doit   = ip_vs_genl_set_daemon,
3656         },
3657         {
3658                 .cmd    = IPVS_CMD_DEL_DAEMON,
3659                 .flags  = GENL_ADMIN_PERM,
3660                 .policy = ip_vs_cmd_policy,
3661                 .doit   = ip_vs_genl_set_daemon,
3662         },
3663         {
3664                 .cmd    = IPVS_CMD_GET_DAEMON,
3665                 .flags  = GENL_ADMIN_PERM,
3666                 .dumpit = ip_vs_genl_dump_daemons,
3667         },
3668         {
3669                 .cmd    = IPVS_CMD_SET_CONFIG,
3670                 .flags  = GENL_ADMIN_PERM,
3671                 .policy = ip_vs_cmd_policy,
3672                 .doit   = ip_vs_genl_set_cmd,
3673         },
3674         {
3675                 .cmd    = IPVS_CMD_GET_CONFIG,
3676                 .flags  = GENL_ADMIN_PERM,
3677                 .doit   = ip_vs_genl_get_cmd,
3678         },
3679         {
3680                 .cmd    = IPVS_CMD_GET_INFO,
3681                 .flags  = GENL_ADMIN_PERM,
3682                 .doit   = ip_vs_genl_get_cmd,
3683         },
3684         {
3685                 .cmd    = IPVS_CMD_ZERO,
3686                 .flags  = GENL_ADMIN_PERM,
3687                 .policy = ip_vs_cmd_policy,
3688                 .doit   = ip_vs_genl_set_cmd,
3689         },
3690         {
3691                 .cmd    = IPVS_CMD_FLUSH,
3692                 .flags  = GENL_ADMIN_PERM,
3693                 .doit   = ip_vs_genl_set_cmd,
3694         },
3695 };
3696
3697 static int __init ip_vs_genl_register(void)
3698 {
3699         return genl_register_family_with_ops(&ip_vs_genl_family,
3700                 ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
3701 }
3702
3703 static void ip_vs_genl_unregister(void)
3704 {
3705         genl_unregister_family(&ip_vs_genl_family);
3706 }
3707
3708 /* End of Generic Netlink interface definitions */
3709
/*
 * Per-netns init/exit functions.
 */
3713 #ifdef CONFIG_SYSCTL
3714 static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
3715 {
3716         int idx;
3717         struct netns_ipvs *ipvs = net_ipvs(net);
3718         struct ctl_table *tbl;
3719
3720         atomic_set(&ipvs->dropentry, 0);
3721         spin_lock_init(&ipvs->dropentry_lock);
3722         spin_lock_init(&ipvs->droppacket_lock);
3723         spin_lock_init(&ipvs->securetcp_lock);
3724
3725         if (!net_eq(net, &init_net)) {
3726                 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
3727                 if (tbl == NULL)
3728                         return -ENOMEM;
3729
3730                 /* Don't export sysctls to unprivileged users */
3731                 if (net->user_ns != &init_user_ns)
3732                         tbl[0].procname = NULL;
3733         } else
3734                 tbl = vs_vars;
3735         /* Initialize sysctl defaults */
3736         idx = 0;
3737         ipvs->sysctl_amemthresh = 1024;
3738         tbl[idx++].data = &ipvs->sysctl_amemthresh;
3739         ipvs->sysctl_am_droprate = 10;
3740         tbl[idx++].data = &ipvs->sysctl_am_droprate;
3741         tbl[idx++].data = &ipvs->sysctl_drop_entry;
3742         tbl[idx++].data = &ipvs->sysctl_drop_packet;
3743 #ifdef CONFIG_IP_VS_NFCT
3744         tbl[idx++].data = &ipvs->sysctl_conntrack;
3745 #endif
3746         tbl[idx++].data = &ipvs->sysctl_secure_tcp;
3747         ipvs->sysctl_snat_reroute = 1;
3748         tbl[idx++].data = &ipvs->sysctl_snat_reroute;
3749         ipvs->sysctl_sync_ver = 1;
3750         tbl[idx++].data = &ipvs->sysctl_sync_ver;
3751         ipvs->sysctl_sync_ports = 1;
3752         tbl[idx++].data = &ipvs->sysctl_sync_ports;
3753         ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
3754         tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
3755         ipvs->sysctl_sync_sock_size = 0;
3756         tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
3757         tbl[idx++].data = &ipvs->sysctl_cache_bypass;
3758         tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
3759         tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
3760         ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
3761         ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
3762         tbl[idx].data = &ipvs->sysctl_sync_threshold;
3763         tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
3764         ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
3765         tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
3766         ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
3767         tbl[idx++].data = &ipvs->sysctl_sync_retries;
3768         tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
3769         ipvs->sysctl_pmtu_disc = 1;
3770         tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
3771         tbl[idx++].data = &ipvs->sysctl_backup_only;
3772
3773
3774         ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
3775         if (ipvs->sysctl_hdr == NULL) {
3776                 if (!net_eq(net, &init_net))
3777                         kfree(tbl);
3778                 return -ENOMEM;
3779         }
3780         ip_vs_start_estimator(net, &ipvs->tot_stats);
3781         ipvs->sysctl_tbl = tbl;
3782         /* Schedule defense work */
3783         INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
3784         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
3785
3786         return 0;
3787 }
3788
3789 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
3790 {
3791         struct netns_ipvs *ipvs = net_ipvs(net);
3792
3793         cancel_delayed_work_sync(&ipvs->defense_work);
3794         cancel_work_sync(&ipvs->defense_work.work);
3795         unregister_net_sysctl_table(ipvs->sysctl_hdr);
3796 }
3797
3798 #else
3799
3800 static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; }
3801 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { }
3802
3803 #endif
3804
3805 static struct notifier_block ip_vs_dst_notifier = {
3806         .notifier_call = ip_vs_dst_event,
3807 };
3808
3809 int __net_init ip_vs_control_net_init(struct net *net)
3810 {
3811         int idx;
3812         struct netns_ipvs *ipvs = net_ipvs(net);
3813
3814         rwlock_init(&ipvs->rs_lock);
3815
3816         /* Initialize rs_table */
3817         for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
3818                 INIT_LIST_HEAD(&ipvs->rs_table[idx]);
3819
3820         INIT_LIST_HEAD(&ipvs->dest_trash);
3821         atomic_set(&ipvs->ftpsvc_counter, 0);
3822         atomic_set(&ipvs->nullsvc_counter, 0);
3823
3824         /* procfs stats */
3825         ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
3826         if (!ipvs->tot_stats.cpustats)
3827                 return -ENOMEM;
3828
3829         spin_lock_init(&ipvs->tot_stats.lock);
3830
3831         proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
3832         proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
3833         proc_create("ip_vs_stats_percpu", 0, net->proc_net,
3834                     &ip_vs_stats_percpu_fops);
3835
3836         if (ip_vs_control_net_init_sysctl(net))
3837                 goto err;
3838
3839         return 0;
3840
3841 err:
3842         free_percpu(ipvs->tot_stats.cpustats);
3843         return -ENOMEM;
3844 }
3845
3846 void __net_exit ip_vs_control_net_cleanup(struct net *net)
3847 {
3848         struct netns_ipvs *ipvs = net_ipvs(net);
3849
3850         ip_vs_trash_cleanup(net);
3851         ip_vs_stop_estimator(net, &ipvs->tot_stats);
3852         ip_vs_control_net_cleanup_sysctl(net);
3853         remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
3854         remove_proc_entry("ip_vs_stats", net->proc_net);
3855         remove_proc_entry("ip_vs", net->proc_net);
3856         free_percpu(ipvs->tot_stats.cpustats);
3857 }
3858
3859 int __init ip_vs_register_nl_ioctl(void)
3860 {
3861         int ret;
3862
3863         ret = nf_register_sockopt(&ip_vs_sockopts);
3864         if (ret) {
3865                 pr_err("cannot register sockopt.\n");
3866                 goto err_sock;
3867         }
3868
3869         ret = ip_vs_genl_register();
3870         if (ret) {
3871                 pr_err("cannot register Generic Netlink interface.\n");
3872                 goto err_genl;
3873         }
3874         return 0;
3875
3876 err_genl:
3877         nf_unregister_sockopt(&ip_vs_sockopts);
3878 err_sock:
3879         return ret;
3880 }
3881
3882 void ip_vs_unregister_nl_ioctl(void)
3883 {
3884         ip_vs_genl_unregister();
3885         nf_unregister_sockopt(&ip_vs_sockopts);
3886 }
3887
3888 int __init ip_vs_control_init(void)
3889 {
3890         int idx;
3891         int ret;
3892
3893         EnterFunction(2);
3894
3895         /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
3896         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
3897                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
3898                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
3899         }
3900
3901         smp_wmb();      /* Do we really need it now ? */
3902
3903         ret = register_netdevice_notifier(&ip_vs_dst_notifier);
3904         if (ret < 0)
3905                 return ret;
3906
3907         LeaveFunction(2);
3908         return 0;
3909 }
3910
3911
3912 void ip_vs_control_cleanup(void)
3913 {
3914         EnterFunction(2);
3915         unregister_netdevice_notifier(&ip_vs_dst_notifier);
3916         LeaveFunction(2);
3917 }