2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
21 #define KMSG_COMPONENT "IPVS"
22 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
24 #include <linux/module.h>
25 #include <linux/init.h>
26 #include <linux/types.h>
27 #include <linux/capability.h>
29 #include <linux/sysctl.h>
30 #include <linux/proc_fs.h>
31 #include <linux/workqueue.h>
32 #include <linux/swap.h>
33 #include <linux/seq_file.h>
34 #include <linux/slab.h>
36 #include <linux/netfilter.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/mutex.h>
40 #include <net/net_namespace.h>
41 #include <linux/nsproxy.h>
43 #ifdef CONFIG_IP_VS_IPV6
45 #include <net/ip6_route.h>
47 #include <net/route.h>
49 #include <net/genetlink.h>
51 #include <asm/uaccess.h>
53 #include <net/ip_vs.h>
55 /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
56 static DEFINE_MUTEX(__ip_vs_mutex);
58 /* lock for service table */
59 static DEFINE_RWLOCK(__ip_vs_svc_lock);
61 /* sysctl variables */
63 #ifdef CONFIG_IP_VS_DEBUG
64 static int sysctl_ip_vs_debug_level = 0;
/* Return the current debug level; consumed by the IP_VS_DBG() macros. */
66 int ip_vs_get_debug_level(void)
68 return sysctl_ip_vs_debug_level;
/* Forward declaration; defined later in this file. */
74 static void __ip_vs_del_service(struct ip_vs_service *svc);
77 #ifdef CONFIG_IP_VS_IPV6
78 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
/*
 * Return true if @addr is a local IPv6 address in netns @net: the route
 * lookup must succeed and resolve to a loopback device.
 */
79 static bool __ip_vs_addr_is_local_v6(struct net *net,
80 const struct in6_addr *addr)
85 struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
88 is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
97 * update_defense_level is called from keventd and from sysctl,
98 * so it needs to protect itself from softirqs
/*
 * Re-evaluate the three DoS-defense strategies (drop_entry, drop_packet,
 * secure_tcp) against current memory pressure.  Mode 1 of each strategy
 * auto-escalates to mode 2 while memory is low and falls back afterwards.
 */
100 static void update_defense_level(struct netns_ipvs *ipvs)
/*
 * NOTE(review): function-scope static is shared by ALL network namespaces
 * and all callers of this function — per-netns state would be safer; the
 * securetcp_lock only serializes part of the accesses.  TODO confirm.
 */
103 static int old_secure_tcp = 0;
108 /* we only count free and buffered memory (in pages) */
110 availmem = i.freeram + i.bufferram;
111 /* however in linux 2.5 the i.bufferram is total page cache size,
113 /* si_swapinfo(&i); */
114 /* availmem = availmem - (i.totalswap - i.freeswap); */
/* below the admin-set available-memory threshold => defenses kick in */
116 nomem = (availmem < ipvs->sysctl_amemthresh);
/* drop_entry strategy: randomly drop connection entries under pressure */
121 spin_lock(&ipvs->dropentry_lock);
122 switch (ipvs->sysctl_drop_entry) {
124 atomic_set(&ipvs->dropentry, 0);
128 atomic_set(&ipvs->dropentry, 1);
129 ipvs->sysctl_drop_entry = 2;
131 atomic_set(&ipvs->dropentry, 0);
136 atomic_set(&ipvs->dropentry, 1);
138 atomic_set(&ipvs->dropentry, 0);
139 ipvs->sysctl_drop_entry = 1;
143 atomic_set(&ipvs->dropentry, 1);
146 spin_unlock(&ipvs->dropentry_lock);
/* drop_packet strategy: rate-drop incoming packets under pressure */
149 spin_lock(&ipvs->droppacket_lock);
150 switch (ipvs->sysctl_drop_packet) {
/* drop rate grows as availmem approaches the threshold */
156 ipvs->drop_rate = ipvs->drop_counter
157 = ipvs->sysctl_amemthresh /
158 (ipvs->sysctl_amemthresh-availmem);
159 ipvs->sysctl_drop_packet = 2;
166 ipvs->drop_rate = ipvs->drop_counter
167 = ipvs->sysctl_amemthresh /
168 (ipvs->sysctl_amemthresh-availmem);
171 ipvs->sysctl_drop_packet = 1;
175 ipvs->drop_rate = ipvs->sysctl_am_droprate;
178 spin_unlock(&ipvs->droppacket_lock);
/* secure_tcp strategy: switch protocol timeouts when value becomes > 1 */
181 spin_lock(&ipvs->securetcp_lock);
182 switch (ipvs->sysctl_secure_tcp) {
184 if (old_secure_tcp >= 2)
189 if (old_secure_tcp < 2)
191 ipvs->sysctl_secure_tcp = 2;
193 if (old_secure_tcp >= 2)
199 if (old_secure_tcp < 2)
202 if (old_secure_tcp >= 2)
204 ipvs->sysctl_secure_tcp = 1;
208 if (old_secure_tcp < 2)
212 old_secure_tcp = ipvs->sysctl_secure_tcp;
214 ip_vs_protocol_timeout_change(ipvs,
215 ipvs->sysctl_secure_tcp > 1);
216 spin_unlock(&ipvs->securetcp_lock);
223 * Timer for checking the defense
225 #define DEFENSE_TIMER_PERIOD 1*HZ
/*
 * Periodic (1s) worker: refresh defense levels and, when drop_entry is
 * active, randomly expire connection entries; then re-arm itself.
 */
227 static void defense_work_handler(struct work_struct *work)
229 struct netns_ipvs *ipvs =
230 container_of(work, struct netns_ipvs, defense_work.work);
232 update_defense_level(ipvs);
234 if (atomic_read(&ipvs->dropentry))
235 ip_vs_random_dropentry(ipvs->net);
236 schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
/* Take a reference on this module; returns false if the module is unloading. */
240 ip_vs_use_count_inc(void)
242 return try_module_get(THIS_MODULE);
/* Drop a module reference taken by ip_vs_use_count_inc(). */
246 ip_vs_use_count_dec(void)
248 module_put(THIS_MODULE);
253 * Hash table: for virtual service lookups
255 #define IP_VS_SVC_TAB_BITS 8
256 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
257 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
259 /* the service table hashed by <protocol, addr, port> */
260 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
261 /* the service table hashed by fwmark */
262 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
266 * Returns hash value for virtual service
/*
 * Bucket index into ip_vs_svc_table for <netns, af, proto, addr, port>.
 * IPv6 addresses are folded by XORing their four 32-bit words; the netns
 * pointer is mixed in (shifted to skip allocator alignment bits).
 */
268 static inline unsigned int
269 ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
270 const union nf_inet_addr *addr, __be16 port)
272 register unsigned int porth = ntohs(port);
273 __be32 addr_fold = addr->ip;
276 #ifdef CONFIG_IP_VS_IPV6
278 addr_fold = addr->ip6[0]^addr->ip6[1]^
279 addr->ip6[2]^addr->ip6[3];
281 ahash = ntohl(addr_fold);
282 ahash ^= ((size_t) net >> 8);
284 return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
289 * Returns hash value of fwmark for virtual service lookup
/* Bucket index into ip_vs_svc_fwm_table for <netns, fwmark>. */
291 static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
293 return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
297 * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
298 * or in the ip_vs_svc_fwm_table by fwmark.
299 * Should be called with locked tables.
/*
 * Insert @svc into exactly one of the two tables: fwmark table when
 * svc->fwmark != 0, otherwise the <proto,addr,port> table.  Marks the
 * service HASHED and takes a refcnt for the table's reference.
 */
301 static int ip_vs_svc_hash(struct ip_vs_service *svc)
305 if (svc->flags & IP_VS_SVC_F_HASHED) {
306 pr_err("%s(): request for already hashed, called from %pF\n",
307 __func__, __builtin_return_address(0));
311 if (svc->fwmark == 0) {
313 * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
315 hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
316 &svc->addr, svc->port);
317 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
320 * Hash it by fwmark in svc_fwm_table
322 hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
323 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
326 svc->flags |= IP_VS_SVC_F_HASHED;
327 /* increase its refcnt because it is referenced by the svc table */
328 atomic_inc(&svc->refcnt);
334 * Unhashes a service from svc_table / svc_fwm_table.
335 * Should be called with locked tables.
/*
 * Reverse of ip_vs_svc_hash(): remove @svc from whichever table holds it,
 * clear the HASHED flag and drop the table's refcnt.
 */
337 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
339 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
340 pr_err("%s(): request for unhash flagged, called from %pF\n",
341 __func__, __builtin_return_address(0));
345 if (svc->fwmark == 0) {
346 /* Remove it from the svc_table table */
347 list_del(&svc->s_list);
349 /* Remove it from the svc_fwm_table table */
350 list_del(&svc->f_list);
353 svc->flags &= ~IP_VS_SVC_F_HASHED;
354 atomic_dec(&svc->refcnt);
360 * Get service by {netns, proto,addr,port} in the service table.
/* Exact-match lookup; returns NULL on miss.  Caller holds the svc lock. */
362 static inline struct ip_vs_service *
363 __ip_vs_service_find(struct net *net, int af, __u16 protocol,
364 const union nf_inet_addr *vaddr, __be16 vport)
367 struct ip_vs_service *svc;
369 /* Check for "full" addressed entries */
370 hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
372 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
374 && ip_vs_addr_equal(af, &svc->addr, vaddr)
375 && (svc->port == vport)
376 && (svc->protocol == protocol)
377 && net_eq(svc->net, net)) {
388 * Get service by {fwmark} in the service table.
/* fwmark-based lookup; returns NULL on miss.  Caller holds the svc lock. */
390 static inline struct ip_vs_service *
391 __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
394 struct ip_vs_service *svc;
396 /* Check for fwmark addressed entries */
397 hash = ip_vs_svc_fwm_hashkey(net, fwmark);
399 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
400 if (svc->fwmark == fwmark && svc->af == af
401 && net_eq(svc->net, net)) {
/*
 * Public service lookup.  Tries, in order: fwmark table, exact
 * <proto,addr,port>, the FTP control service (for FTP data connections),
 * and finally the catch-all port-zero service.  On a hit the service's
 * usecnt is bumped; caller must release it with ip_vs_service_put().
 */
410 struct ip_vs_service *
411 ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
412 const union nf_inet_addr *vaddr, __be16 vport)
414 struct ip_vs_service *svc;
415 struct netns_ipvs *ipvs = net_ipvs(net);
417 read_lock(&__ip_vs_svc_lock);
420 * Check the table hashed by fwmark first
423 svc = __ip_vs_svc_fwm_find(net, af, fwmark);
429 * Check the table hashed by <protocol,addr,port>
430 * for "full" addressed entries
432 svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
435 && protocol == IPPROTO_TCP
436 && atomic_read(&ipvs->ftpsvc_counter)
437 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
439 * Check if ftp service entry exists, the packet
440 * might belong to FTP data connections.
442 svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT)
446 && atomic_read(&ipvs->nullsvc_counter)) {
448 * Check if the catch-all port (port zero) exists
450 svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
/* hand out a usage reference before dropping the lock */
455 atomic_inc(&svc->usecnt);
456 read_unlock(&__ip_vs_svc_lock);
458 IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
459 fwmark, ip_vs_proto_name(protocol),
460 IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
461 svc ? "hit" : "not hit");
/* Bind @dest to @svc, taking a service reference for the destination. */
468 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
470 atomic_inc(&svc->refcnt);
/*
 * Release a destination's service reference; if it was the last one,
 * free the service's per-cpu stats (and the service itself).
 */
475 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
477 struct ip_vs_service *svc = dest->svc;
480 if (atomic_dec_and_test(&svc->refcnt)) {
481 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
483 IP_VS_DBG_ADDR(svc->af, &svc->addr),
484 ntohs(svc->port), atomic_read(&svc->usecnt));
485 free_percpu(svc->stats.cpustats);
492 * Returns hash value for real service
/* Bucket index into ipvs->rs_table for a real server <af, addr, port>. */
494 static inline unsigned int ip_vs_rs_hashkey(int af,
495 const union nf_inet_addr *addr,
498 register unsigned int porth = ntohs(port);
499 __be32 addr_fold = addr->ip;
501 #ifdef CONFIG_IP_VS_IPV6
/* fold an IPv6 address into 32 bits by XORing its words */
503 addr_fold = addr->ip6[0]^addr->ip6[1]^
504 addr->ip6[2]^addr->ip6[3];
507 return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
512 * Hashes ip_vs_dest in rs_table by <proto,addr,port>.
513 * should be called with locked tables.
/* No-op if the dest is already hashed (d_list non-empty). */
515 static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
519 if (!list_empty(&dest->d_list)) {
524 * Hash by proto,addr,port,
525 * which are the parameters of the real service.
527 hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
529 list_add(&dest->d_list, &ipvs->rs_table[hash]);
535 * UNhashes ip_vs_dest from rs_table.
536 * should be called with locked tables.
/* list_del_init keeps d_list self-linked so a re-hash check still works. */
538 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
541 * Remove it from the rs_table table.
543 if (!list_empty(&dest->d_list)) {
544 list_del_init(&dest->d_list);
551 * Lookup real service by <proto,addr,port> in the real service table.
/* Returns the first matching real server, under ipvs->rs_lock. */
554 ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
555 const union nf_inet_addr *daddr,
558 struct netns_ipvs *ipvs = net_ipvs(net);
560 struct ip_vs_dest *dest;
563 * Check for "full" addressed entries
564 * Return the first found entry
566 hash = ip_vs_rs_hashkey(af, daddr, dport);
568 read_lock(&ipvs->rs_lock);
569 list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
571 && ip_vs_addr_equal(af, &dest->addr, daddr)
572 && (dest->port == dport)
573 && ((dest->protocol == protocol) ||
576 read_unlock(&ipvs->rs_lock);
580 read_unlock(&ipvs->rs_lock);
586 * Lookup destination by {addr,port} in the given service
/* Linear scan of svc->destinations; returns NULL if not found. */
588 static struct ip_vs_dest *
589 ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
592 struct ip_vs_dest *dest;
595 * Find the destination for the given service
597 list_for_each_entry(dest, &svc->destinations, n_list) {
598 if ((dest->af == svc->af)
599 && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
600 && (dest->port == dport)) {
610 * Find destination by {daddr,dport,vaddr,protocol}
611 * Created to be used in ip_vs_process_message() in
612 * the backup synchronization daemon. It finds the
613 * destination to be bound to the received connection
616 * ip_vs_lookup_real_service() looked promising, but
617 * seems not working as expected.
/*
 * Looks up the virtual service first, then the dest within it.  On
 * success the dest's refcnt is bumped and the service reference is
 * released before returning.  NOTE(review): the second lookup with
 * "port ^ dport" appears to retry with the real dport after a
 * fwmark-forced port-0 lookup failed — TODO confirm against the elided
 * lines that set "port".
 */
619 struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
620 const union nf_inet_addr *daddr,
622 const union nf_inet_addr *vaddr,
623 __be16 vport, __u16 protocol, __u32 fwmark,
626 struct ip_vs_dest *dest;
627 struct ip_vs_service *svc;
630 svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
633 if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
635 dest = ip_vs_lookup_dest(svc, daddr, port);
637 dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
639 atomic_inc(&dest->refcnt);
640 ip_vs_service_put(svc);
/* RCU callback: release the cached route and free the ip_vs_dest_dst. */
644 void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
646 struct ip_vs_dest_dst *dest_dst = container_of(head,
647 struct ip_vs_dest_dst,
650 dst_release(dest_dst->dst_cache);
654 /* Release dest_dst and dst_cache for dest in user context */
655 static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
657 struct ip_vs_dest_dst *old;
/* update side is serialized by the caller, hence lockdep condition 1 */
659 old = rcu_dereference_protected(dest->dest_dst, 1);
661 RCU_INIT_POINTER(dest->dest_dst, NULL);
662 call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
667 * Lookup dest by {svc,addr,port} in the destination trash.
668 * The destination trash is used to hold the destinations that are removed
669 * from the service table but are still referenced by some conn entries.
670 * The reason to add the destination trash is when the dest is temporary
671 * down (either by administrator or by monitor program), the dest can be
672 * picked back from the trash, the remaining connections to the dest can
673 * continue, and the counting information of the dest is also useful for
/*
 * Walk the per-netns trash; a match must agree on address, port and the
 * owning virtual service identity (fwmark or vaddr/vport + protocol).
 * While walking, opportunistically purge trash entries whose refcnt
 * dropped to 1 (only the trash itself still references them).
 */
676 static struct ip_vs_dest *
677 ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
680 struct ip_vs_dest *dest, *nxt;
681 struct netns_ipvs *ipvs = net_ipvs(svc->net);
684 * Find the destination in trash
686 list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
687 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
690 IP_VS_DBG_ADDR(svc->af, &dest->addr),
692 atomic_read(&dest->refcnt));
693 if (dest->af == svc->af &&
694 ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
695 dest->port == dport &&
696 dest->vfwmark == svc->fwmark &&
697 dest->protocol == svc->protocol &&
699 (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
700 dest->vport == svc->port))) {
706 * Try to purge the destination from trash if not referenced
708 if (atomic_read(&dest->refcnt) == 1) {
709 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
712 IP_VS_DBG_ADDR(svc->af, &dest->addr),
714 list_del(&dest->n_list);
715 __ip_vs_dst_cache_reset(dest);
716 __ip_vs_unbind_svc(dest);
717 free_percpu(dest->stats.cpustats);
727 * Clean up all the destinations in the trash
728 * Called by the ip_vs_control_cleanup()
730 * When the ip_vs_control_cleanup is activated by ipvs module exit,
731 * the service tables must have been flushed and all the connections
732 * are expired, and the refcnt of each destination in the trash must
733 * be 1, so we simply release them here.
735 static void ip_vs_trash_cleanup(struct net *net)
737 struct ip_vs_dest *dest, *nxt;
738 struct netns_ipvs *ipvs = net_ipvs(net);
740 list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
741 list_del(&dest->n_list);
742 __ip_vs_dst_cache_reset(dest);
743 __ip_vs_unbind_svc(dest);
744 free_percpu(dest->stats.cpustats);
/*
 * Copy counters to userspace layout, reporting each counter relative to
 * its snapshot taken at the last "zero stats" (ustats - ustats0).
 */
750 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
752 #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c
754 spin_lock_bh(&src->lock);
756 IP_VS_SHOW_STATS_COUNTER(conns);
757 IP_VS_SHOW_STATS_COUNTER(inpkts);
758 IP_VS_SHOW_STATS_COUNTER(outpkts);
759 IP_VS_SHOW_STATS_COUNTER(inbytes);
760 IP_VS_SHOW_STATS_COUNTER(outbytes);
762 ip_vs_read_estimator(dst, src);
764 spin_unlock_bh(&src->lock);
/*
 * "Zero" stats without destroying the running totals: record the current
 * counters as the new baseline (ustats0) and reset the rate estimator.
 */
768 ip_vs_zero_stats(struct ip_vs_stats *stats)
770 spin_lock_bh(&stats->lock);
772 /* get current counters as zero point, rates are zeroed */
774 #define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c
776 IP_VS_ZERO_STATS_COUNTER(conns);
777 IP_VS_ZERO_STATS_COUNTER(inpkts);
778 IP_VS_ZERO_STATS_COUNTER(outpkts);
779 IP_VS_ZERO_STATS_COUNTER(inbytes);
780 IP_VS_ZERO_STATS_COUNTER(outbytes);
782 ip_vs_zero_estimator(stats);
784 spin_unlock_bh(&stats->lock);
788 * Update a destination in the given service
/*
 * Apply the user-supplied parameters (@udest) to @dest: weight, forwarding
 * flags, thresholds; (re)bind to @svc; reset the cached route; and, when
 * @add, link the dest into the service under the svc write lock.
 */
791 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
792 struct ip_vs_dest_user_kern *udest, int add)
794 struct netns_ipvs *ipvs = net_ipvs(svc->net);
797 /* set the weight and the flags */
798 atomic_set(&dest->weight, udest->weight);
799 conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
800 conn_flags |= IP_VS_CONN_F_INACTIVE;
802 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
803 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
804 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
807 * Put the real service in rs_table if not present.
808 * For now only for NAT!
810 write_lock_bh(&ipvs->rs_lock);
811 ip_vs_rs_hash(ipvs, dest);
812 write_unlock_bh(&ipvs->rs_lock);
814 atomic_set(&dest->conn_flags, conn_flags);
816 /* bind the service */
818 __ip_vs_bind_svc(dest, svc);
/* rebinding to a different service: stats restart from zero */
820 if (dest->svc != svc) {
821 __ip_vs_unbind_svc(dest);
822 ip_vs_zero_stats(&dest->stats);
823 __ip_vs_bind_svc(dest, svc);
827 /* set the dest status flags */
828 dest->flags |= IP_VS_DEST_F_AVAILABLE;
/* raising (or removing) the upper threshold clears the overload flag */
830 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
831 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
832 dest->u_threshold = udest->u_threshold;
833 dest->l_threshold = udest->l_threshold;
/* drop any cached route so the next packet re-resolves it */
835 spin_lock_bh(&dest->dst_lock);
836 __ip_vs_dst_cache_reset(dest);
837 spin_unlock_bh(&dest->dst_lock);
840 ip_vs_start_estimator(svc->net, &dest->stats);
842 write_lock_bh(&__ip_vs_svc_lock);
844 /* Wait until all other svc users go away */
845 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
848 list_add(&dest->n_list, &svc->destinations);
852 /* call the update_service, because server weight may be changed */
853 if (svc->scheduler->update_service)
854 svc->scheduler->update_service(svc);
856 write_unlock_bh(&__ip_vs_svc_lock);
861 * Create a destination for the given service
/*
 * Allocate and initialize a new real-server entry for @svc from @udest.
 * The destination address must be unicast or local; on success *dest_p
 * receives the new dest (refcnt 1) already linked into the service.
 */
864 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
865 struct ip_vs_dest **dest_p)
867 struct ip_vs_dest *dest;
872 #ifdef CONFIG_IP_VS_IPV6
873 if (svc->af == AF_INET6) {
874 atype = ipv6_addr_type(&udest->addr.in6);
875 if ((!(atype & IPV6_ADDR_UNICAST) ||
876 atype & IPV6_ADDR_LINKLOCAL) &&
877 !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
882 atype = inet_addr_type(svc->net, udest->addr.ip);
883 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
887 dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
891 dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
892 if (!dest->stats.cpustats)
/* record the owning virtual service identity on the dest */
896 dest->protocol = svc->protocol;
897 dest->vaddr = svc->addr;
898 dest->vport = svc->port;
899 dest->vfwmark = svc->fwmark;
900 ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
901 dest->port = udest->port;
903 atomic_set(&dest->activeconns, 0);
904 atomic_set(&dest->inactconns, 0);
905 atomic_set(&dest->persistconns, 0);
906 atomic_set(&dest->refcnt, 1);
908 INIT_LIST_HEAD(&dest->d_list);
909 spin_lock_init(&dest->dst_lock);
910 spin_lock_init(&dest->stats.lock);
911 __ip_vs_update_dest(svc, dest, udest, 1);
925 * Add a destination into an existing service
/*
 * Userspace entry point for adding a real server.  Validates weight and
 * thresholds, rejects duplicates, resurrects a matching dest from the
 * trash when possible, otherwise allocates a fresh one.
 */
928 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
930 struct ip_vs_dest *dest;
931 union nf_inet_addr daddr;
932 __be16 dport = udest->port;
937 if (udest->weight < 0) {
938 pr_err("%s(): server weight less than zero\n", __func__);
942 if (udest->l_threshold > udest->u_threshold) {
943 pr_err("%s(): lower threshold is higher than upper threshold\n",
948 ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
951 * Check if the dest already exists in the list
953 dest = ip_vs_lookup_dest(svc, &daddr, dport);
956 IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
961 * Check if the dest already exists in the trash and
962 * is from the same service
964 dest = ip_vs_trash_get_dest(svc, &daddr, dport);
967 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
968 "dest->refcnt=%d, service %u/%s:%u\n",
969 IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
970 atomic_read(&dest->refcnt),
972 IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
976 * Get the destination from the trash
978 list_del(&dest->n_list);
980 __ip_vs_update_dest(svc, dest, udest, 1);
984 * Allocate and initialize the dest structure
986 ret = ip_vs_new_dest(svc, udest, &dest);
995 * Edit a destination in the given service
/*
 * Userspace entry point for editing an existing real server.  Same
 * validation as ip_vs_add_dest(), but the dest must already exist.
 */
998 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1000 struct ip_vs_dest *dest;
1001 union nf_inet_addr daddr;
1002 __be16 dport = udest->port;
1006 if (udest->weight < 0) {
1007 pr_err("%s(): server weight less than zero\n", __func__);
1011 if (udest->l_threshold > udest->u_threshold) {
1012 pr_err("%s(): lower threshold is higher than upper threshold\n",
1017 ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
1020 * Lookup the destination list
1022 dest = ip_vs_lookup_dest(svc, &daddr, dport);
1025 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
1029 __ip_vs_update_dest(svc, dest, udest, 0);
1037 * Delete a destination (must be already unlinked from the service)
/*
 * Stop its estimator and unhash it from rs_table; then either free it
 * (last reference) or park it in the trash for conns still using it.
 */
1039 static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
1041 struct netns_ipvs *ipvs = net_ipvs(net);
1043 ip_vs_stop_estimator(net, &dest->stats);
1046 * Remove it from the d-linked list with the real services.
1048 write_lock_bh(&ipvs->rs_lock);
1049 ip_vs_rs_unhash(dest);
1050 write_unlock_bh(&ipvs->rs_lock);
1053 * Decrease the refcnt of the dest, and free the dest
1054 * if nobody refers to it (refcnt=0). Otherwise, throw
1055 * the destination into the trash.
1057 if (atomic_dec_and_test(&dest->refcnt)) {
1058 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
1060 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1062 __ip_vs_dst_cache_reset(dest);
1063 /* simply decrease svc->refcnt here, let the caller check
1064 and release the service if nobody refers to it.
1065 Only user context can release destination and service,
1066 and only one user context can update virtual service at a
1067 time, so the operation here is OK */
1068 atomic_dec(&dest->svc->refcnt);
1069 free_percpu(dest->stats.cpustats);
1072 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
1073 "dest->refcnt=%d\n",
1074 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1076 atomic_read(&dest->refcnt));
/* the trash list itself holds one extra reference */
1077 list_add(&dest->n_list, &ipvs->dest_trash);
1078 atomic_inc(&dest->refcnt);
1084 * Unlink a destination from the given service
/*
 * Mark @dest unavailable and remove it from svc->destinations; notify
 * the scheduler only when @svcupd is set (skipped during service teardown).
 */
1086 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1087 struct ip_vs_dest *dest,
1090 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1093 * Remove it from the d-linked destination list.
1095 list_del(&dest->n_list);
1099 * Call the update_service function of its scheduler
1101 if (svcupd && svc->scheduler->update_service)
1102 svc->scheduler->update_service(svc);
1107 * Delete a destination server in the given service
/*
 * Userspace entry point for removing a real server: unlink it under the
 * svc write lock (after all readers drain), then dispose of it.
 */
1110 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1112 struct ip_vs_dest *dest;
1113 __be16 dport = udest->port;
1117 dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
1120 IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1124 write_lock_bh(&__ip_vs_svc_lock);
1127 * Wait until all other svc users go away.
1129 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1132 * Unlink dest from the service
1134 __ip_vs_unlink_dest(svc, dest, 1);
1136 write_unlock_bh(&__ip_vs_svc_lock);
1139 * Delete the destination
1141 __ip_vs_del_dest(svc->net, dest);
1150 * Add a service into the service hash table
/*
 * Create a new virtual service from userspace parameters @u: resolve the
 * scheduler and optional persistence engine by name, allocate and fill
 * the service, bind scheduler/pe, bump the FTP/null-service counters,
 * start its estimator and hash it into the tables.  Error paths unwind
 * in reverse order and drop the module reference.
 */
1153 ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
1154 struct ip_vs_service **svc_p)
1157 struct ip_vs_scheduler *sched = NULL;
1158 struct ip_vs_pe *pe = NULL;
1159 struct ip_vs_service *svc = NULL;
1160 struct netns_ipvs *ipvs = net_ipvs(net);
1162 /* increase the module use count */
1163 ip_vs_use_count_inc();
1165 /* Lookup the scheduler by 'u->sched_name' */
1166 sched = ip_vs_scheduler_get(u->sched_name);
1167 if (sched == NULL) {
1168 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1173 if (u->pe_name && *u->pe_name) {
1174 pe = ip_vs_pe_getbyname(u->pe_name);
1176 pr_info("persistence engine module ip_vs_pe_%s "
1177 "not found\n", u->pe_name);
/* IPv6 netmask is a prefix length, not an address mask */
1183 #ifdef CONFIG_IP_VS_IPV6
1184 if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1190 svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
1192 IP_VS_DBG(1, "%s(): no memory\n", __func__);
1196 svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1197 if (!svc->stats.cpustats) {
1202 /* I'm the first user of the service */
1203 atomic_set(&svc->usecnt, 0);
1204 atomic_set(&svc->refcnt, 0);
1207 svc->protocol = u->protocol;
1208 ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1209 svc->port = u->port;
1210 svc->fwmark = u->fwmark;
1211 svc->flags = u->flags;
1212 svc->timeout = u->timeout * HZ;
1213 svc->netmask = u->netmask;
1216 INIT_LIST_HEAD(&svc->destinations);
1217 rwlock_init(&svc->sched_lock);
1218 spin_lock_init(&svc->stats.lock);
1220 /* Bind the scheduler */
1221 ret = ip_vs_bind_scheduler(svc, sched);
1226 /* Bind the ct retriever */
1227 ip_vs_bind_pe(svc, pe);
1230 /* Update the virtual service counters */
1231 if (svc->port == FTPPORT)
1232 atomic_inc(&ipvs->ftpsvc_counter);
1233 else if (svc->port == 0)
1234 atomic_inc(&ipvs->nullsvc_counter);
1236 ip_vs_start_estimator(net, &svc->stats);
1238 /* Count only IPv4 services for old get/setsockopt interface */
1239 if (svc->af == AF_INET)
1240 ipvs->num_services++;
1242 /* Hash the service into the service table */
1243 write_lock_bh(&__ip_vs_svc_lock);
1244 ip_vs_svc_hash(svc);
1245 write_unlock_bh(&__ip_vs_svc_lock);
1248 /* Now there is a service - full throttle */
/* error unwinding: unbind, free stats, drop scheduler and module refs */
1255 ip_vs_unbind_scheduler(svc);
1258 ip_vs_app_inc_put(svc->inc);
1261 if (svc->stats.cpustats)
1262 free_percpu(svc->stats.cpustats);
1265 ip_vs_scheduler_put(sched);
1268 /* decrease the module use count */
1269 ip_vs_use_count_dec();
1276 * Edit a service and bind it with a new scheduler
/*
 * Update flags/timeout/netmask of an existing service and, when the
 * requested scheduler or persistence engine differ, swap them under the
 * svc write lock after all readers drain.
 */
1279 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1281 struct ip_vs_scheduler *sched, *old_sched;
1282 struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1286 * Lookup the scheduler, by 'u->sched_name'
1288 sched = ip_vs_scheduler_get(u->sched_name);
1289 if (sched == NULL) {
1290 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1295 if (u->pe_name && *u->pe_name) {
1296 pe = ip_vs_pe_getbyname(u->pe_name);
1298 pr_info("persistence engine module ip_vs_pe_%s "
1299 "not found\n", u->pe_name);
/* IPv6 netmask is a prefix length, not an address mask */
1306 #ifdef CONFIG_IP_VS_IPV6
1307 if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1313 write_lock_bh(&__ip_vs_svc_lock);
1316 * Wait until all other svc users go away.
1318 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1321 * Set the flags and timeout value
1323 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1324 svc->timeout = u->timeout * HZ;
1325 svc->netmask = u->netmask;
1327 old_sched = svc->scheduler;
1328 if (sched != old_sched) {
1330 * Unbind the old scheduler
1332 if ((ret = ip_vs_unbind_scheduler(svc))) {
1338 * Bind the new scheduler
1340 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1342 * If ip_vs_bind_scheduler fails, restore the old
1344 * The main reason of failure is out of memory.
1346 * The question is if the old scheduler can be
1347 * restored all the time. TODO: if it cannot be
1348 * restored some time, we must delete the service,
1349 * otherwise the system may crash.
1351 ip_vs_bind_scheduler(svc, old_sched);
1359 ip_vs_unbind_pe(svc);
1360 ip_vs_bind_pe(svc, pe);
1364 write_unlock_bh(&__ip_vs_svc_lock);
/* release the references we no longer hold (put handles NULL) */
1366 ip_vs_scheduler_put(old_sched);
1367 ip_vs_pe_put(old_pe);
1373 * Delete a service from the service list
1374 * - The service must be unlinked, unlocked and not referenced!
1375 * - We are called under _bh lock
/*
 * Tear down @svc: stop its estimator, unbind scheduler/pe/app-inc,
 * unlink and delete every destination, fix the FTP/null counters, and
 * free the service when no references remain.  Drops the module ref
 * taken at service creation.
 */
1377 static void __ip_vs_del_service(struct ip_vs_service *svc)
1379 struct ip_vs_dest *dest, *nxt;
1380 struct ip_vs_scheduler *old_sched;
1381 struct ip_vs_pe *old_pe;
1382 struct netns_ipvs *ipvs = net_ipvs(svc->net);
1386 /* Count only IPv4 services for old get/setsockopt interface */
1387 if (svc->af == AF_INET)
1388 ipvs->num_services--;
1390 ip_vs_stop_estimator(svc->net, &svc->stats);
1392 /* Unbind scheduler */
1393 old_sched = svc->scheduler;
1394 ip_vs_unbind_scheduler(svc);
1395 ip_vs_scheduler_put(old_sched);
1397 /* Unbind persistence engine */
1399 ip_vs_unbind_pe(svc);
1400 ip_vs_pe_put(old_pe);
1402 /* Unbind app inc */
1404 ip_vs_app_inc_put(svc->inc);
1409 * Unlink the whole destination list
1411 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1412 __ip_vs_unlink_dest(svc, dest, 0);
1413 __ip_vs_del_dest(svc->net, dest);
1417 * Update the virtual service counters
1419 if (svc->port == FTPPORT)
1420 atomic_dec(&ipvs->ftpsvc_counter);
1421 else if (svc->port == 0)
1422 atomic_dec(&ipvs->nullsvc_counter);
1425 * Free the service if nobody refers to it
1427 if (atomic_read(&svc->refcnt) == 0) {
1428 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
1430 IP_VS_DBG_ADDR(svc->af, &svc->addr),
1431 ntohs(svc->port), atomic_read(&svc->usecnt));
1432 free_percpu(svc->stats.cpustats);
1436 /* decrease the module use count */
1437 ip_vs_use_count_dec();
1441 * Unlink a service from list and try to delete it if its refcnt reached 0
/*
 * Unhash @svc, wait for all readers to finish with it, then run the full
 * teardown — all under the svc write lock.
 */
1443 static void ip_vs_unlink_service(struct ip_vs_service *svc)
1446 * Unhash it from the service table
1448 write_lock_bh(&__ip_vs_svc_lock);
1450 ip_vs_svc_unhash(svc);
1453 * Wait until all the svc users go away.
1455 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1457 __ip_vs_del_service(svc);
1459 write_unlock_bh(&__ip_vs_svc_lock);
1463 * Delete a service from the service list
/* Thin public wrapper around ip_vs_unlink_service(). */
1465 static int ip_vs_del_service(struct ip_vs_service *svc)
1469 ip_vs_unlink_service(svc);
1476 * Flush all the virtual services
/* Delete every service belonging to netns @net, from both hash tables. */
1478 static int ip_vs_flush(struct net *net)
1481 struct ip_vs_service *svc, *nxt;
1484 * Flush the service table hashed by <netns,protocol,addr,port>
1486 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1487 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
1489 if (net_eq(svc->net, net))
1490 ip_vs_unlink_service(svc);
1495 * Flush the service table hashed by fwmark
1497 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1498 list_for_each_entry_safe(svc, nxt,
1499 &ip_vs_svc_fwm_table[idx], f_list) {
1500 if (net_eq(svc->net, net))
1501 ip_vs_unlink_service(svc);
1509 * Delete service by {netns} in the service table.
1510 * Called by __ip_vs_cleanup()
/* Flush all services of @net under the global IPVS mutex. */
1512 void ip_vs_service_net_cleanup(struct net *net)
1515 /* Check for "full" addressed entries */
1516 mutex_lock(&__ip_vs_mutex);
1518 mutex_unlock(&__ip_vs_mutex);
1522 /* Put all references for device (dst_cache) */
/* Drop @dest's cached route if it points at @dev (device going down). */
1524 ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
1526 spin_lock_bh(&dest->dst_lock);
1527 if (dest->dest_dst && dest->dest_dst->dst_cache->dev == dev) {
1528 IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
1530 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1532 atomic_read(&dest->refcnt));
1533 __ip_vs_dst_cache_reset(dest);
1535 spin_unlock_bh(&dest->dst_lock);
1538 /* Netdev event receiver
1539 * Currently only NETDEV_DOWN is handled to release refs to cached dsts
/*
 * On NETDEV_DOWN, walk every service (both tables) and the trash in the
 * device's netns and release any cached route pointing at the device.
 * NOTE(review): newer kernels obtain the device via
 * netdev_notifier_info_to_dev(ptr) rather than casting ptr — confirm
 * against the target kernel version.
 */
1541 static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
1544 struct net_device *dev = ptr;
1545 struct net *net = dev_net(dev);
1546 struct netns_ipvs *ipvs = net_ipvs(net);
1547 struct ip_vs_service *svc;
1548 struct ip_vs_dest *dest;
1551 if (event != NETDEV_DOWN || !ipvs)
1553 IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
1555 mutex_lock(&__ip_vs_mutex);
1556 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1557 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1558 if (net_eq(svc->net, net)) {
1559 list_for_each_entry(dest, &svc->destinations,
1561 ip_vs_forget_dev(dest, dev);
1566 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1567 if (net_eq(svc->net, net)) {
1568 list_for_each_entry(dest, &svc->destinations,
1570 ip_vs_forget_dev(dest, dev);
/* trashed dests may also hold cached routes to this device */
1577 list_for_each_entry(dest, &ipvs->dest_trash, n_list) {
1578 ip_vs_forget_dev(dest, dev);
1580 mutex_unlock(&__ip_vs_mutex);
/*
 * Zero the stats of one service and all of its destinations, under the
 * service table write lock (bh-disabled).
 */
1586 * Zero counters in a service or all services
1588 static int ip_vs_zero_service(struct ip_vs_service *svc)
1590 struct ip_vs_dest *dest;
1592 write_lock_bh(&__ip_vs_svc_lock);
1593 list_for_each_entry(dest, &svc->destinations, n_list) {
1594 ip_vs_zero_stats(&dest->stats);
1596 ip_vs_zero_stats(&svc->stats);
1597 write_unlock_bh(&__ip_vs_svc_lock);
/*
 * Zero counters of every service in the given netns (both hash tables)
 * and then the netns-global totals.
 */
1601 static int ip_vs_zero_all(struct net *net)
1604 struct ip_vs_service *svc;
1606 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1607 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1608 if (net_eq(svc->net, net))
1609 ip_vs_zero_service(svc);
1613 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1614 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1615 if (net_eq(svc->net, net))
1616 ip_vs_zero_service(svc);
1620 ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
1624 #ifdef CONFIG_SYSCTL
/* upper bound shared by proc_dointvec_minmax users below */
1627 static int three = 3;
/*
 * sysctl handler for defense-mode knobs (drop_entry/drop_packet/
 * secure_tcp): accept only 0..3, restore old value otherwise, and
 * re-evaluate the defense level on a successful write.
 */
1630 proc_do_defense_mode(ctl_table *table, int write,
1631 void __user *buffer, size_t *lenp, loff_t *ppos)
1633 struct net *net = current->nsproxy->net_ns;
1634 int *valp = table->data;
1638 rc = proc_dointvec(table, write, buffer, lenp, ppos);
1639 if (write && (*valp != val)) {
1640 if ((*valp < 0) || (*valp > 3)) {
1641 /* Restore the correct value */
1644 update_defense_level(net_ipvs(net));
/*
 * sysctl handler for the two-element sync_threshold vector: both values
 * must be non-negative and threshold[0] < threshold[1] (when [1] != 0);
 * otherwise the previous pair is restored.
 */
1651 proc_do_sync_threshold(ctl_table *table, int write,
1652 void __user *buffer, size_t *lenp, loff_t *ppos)
1654 int *valp = table->data;
1658 /* backup the value first */
1659 memcpy(val, valp, sizeof(val));
1661 rc = proc_dointvec(table, write, buffer, lenp, ppos);
1662 if (write && (valp[0] < 0 || valp[1] < 0 ||
1663 (valp[0] >= valp[1] && valp[1]))) {
1664 /* Restore the correct value */
1665 memcpy(valp, val, sizeof(val));
/*
 * sysctl handler for sync_version: only 0 or 1 is valid; an
 * out-of-range write is rolled back to the previous value.
 */
1671 proc_do_sync_mode(ctl_table *table, int write,
1672 void __user *buffer, size_t *lenp, loff_t *ppos)
1674 int *valp = table->data;
1678 rc = proc_dointvec(table, write, buffer, lenp, ppos);
1679 if (write && (*valp != val)) {
1680 if ((*valp < 0) || (*valp > 1)) {
1681 /* Restore the correct value */
/*
 * sysctl handler for sync_ports: the value must be a positive power of
 * two (number of sync sender/receiver threads); otherwise rolled back.
 */
1689 proc_do_sync_ports(ctl_table *table, int write,
1690 void __user *buffer, size_t *lenp, loff_t *ppos)
1692 int *valp = table->data;
1696 rc = proc_dointvec(table, write, buffer, lenp, ppos);
1697 if (write && (*valp != val)) {
1698 if (*valp < 1 || !is_power_of_2(*valp)) {
1699 /* Restore the correct value */
/*
 * /proc/sys/net/ipv4/vs/ table.  Entry order matters: the per-netns
 * copy made in ip_vs_control_net_init() indexes this array by position
 * (see the warning below), so never reorder or insert without updating
 * that init code.  .data pointers are mostly filled in per-netns.
 */
1707 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1708 * Do not change order or insert new entries without
1709 * align with netns init in ip_vs_control_net_init()
1712 static struct ctl_table vs_vars[] = {
1714 .procname = "amemthresh",
1715 .maxlen = sizeof(int),
1717 .proc_handler = proc_dointvec,
1720 .procname = "am_droprate",
1721 .maxlen = sizeof(int),
1723 .proc_handler = proc_dointvec,
/* defense-mode knobs share the range-checked handler above */
1726 .procname = "drop_entry",
1727 .maxlen = sizeof(int),
1729 .proc_handler = proc_do_defense_mode,
1732 .procname = "drop_packet",
1733 .maxlen = sizeof(int),
1735 .proc_handler = proc_do_defense_mode,
1737 #ifdef CONFIG_IP_VS_NFCT
1739 .procname = "conntrack",
1740 .maxlen = sizeof(int),
1742 .proc_handler = &proc_dointvec,
1746 .procname = "secure_tcp",
1747 .maxlen = sizeof(int),
1749 .proc_handler = proc_do_defense_mode,
1752 .procname = "snat_reroute",
1753 .maxlen = sizeof(int),
1755 .proc_handler = &proc_dointvec,
/* sync daemon tuning — validated by the dedicated handlers above */
1758 .procname = "sync_version",
1759 .maxlen = sizeof(int),
1761 .proc_handler = &proc_do_sync_mode,
1764 .procname = "sync_ports",
1765 .maxlen = sizeof(int),
1767 .proc_handler = &proc_do_sync_ports,
1770 .procname = "sync_qlen_max",
1771 .maxlen = sizeof(int),
1773 .proc_handler = proc_dointvec,
1776 .procname = "sync_sock_size",
1777 .maxlen = sizeof(int),
1779 .proc_handler = proc_dointvec,
1782 .procname = "cache_bypass",
1783 .maxlen = sizeof(int),
1785 .proc_handler = proc_dointvec,
1788 .procname = "expire_nodest_conn",
1789 .maxlen = sizeof(int),
1791 .proc_handler = proc_dointvec,
1794 .procname = "expire_quiescent_template",
1795 .maxlen = sizeof(int),
1797 .proc_handler = proc_dointvec,
/* two ints: maxlen taken from the netns field, not sizeof(int) */
1800 .procname = "sync_threshold",
1802 sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
1804 .proc_handler = proc_do_sync_threshold,
1807 .procname = "sync_refresh_period",
1808 .maxlen = sizeof(int),
1810 .proc_handler = proc_dointvec_jiffies,
1813 .procname = "sync_retries",
1814 .maxlen = sizeof(int),
1816 .proc_handler = proc_dointvec_minmax,
1821 .procname = "nat_icmp_send",
1822 .maxlen = sizeof(int),
1824 .proc_handler = proc_dointvec,
1827 .procname = "pmtu_disc",
1828 .maxlen = sizeof(int),
1830 .proc_handler = proc_dointvec,
1833 .procname = "backup_only",
1834 .maxlen = sizeof(int),
1836 .proc_handler = proc_dointvec,
1838 #ifdef CONFIG_IP_VS_DEBUG
/* global (not per-netns) debug level, see ip_vs_get_debug_level() */
1840 .procname = "debug_level",
1841 .data = &sysctl_ip_vs_debug_level,
1842 .maxlen = sizeof(int),
1844 .proc_handler = proc_dointvec,
/* DoS-mode TCP/UDP/ICMP state timeouts, stored in jiffies */
1849 .procname = "timeout_established",
1850 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1851 .maxlen = sizeof(int),
1853 .proc_handler = proc_dointvec_jiffies,
1856 .procname = "timeout_synsent",
1857 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1858 .maxlen = sizeof(int),
1860 .proc_handler = proc_dointvec_jiffies,
1863 .procname = "timeout_synrecv",
1864 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1865 .maxlen = sizeof(int),
1867 .proc_handler = proc_dointvec_jiffies,
1870 .procname = "timeout_finwait",
1871 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1872 .maxlen = sizeof(int),
1874 .proc_handler = proc_dointvec_jiffies,
1877 .procname = "timeout_timewait",
1878 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1879 .maxlen = sizeof(int),
1881 .proc_handler = proc_dointvec_jiffies,
1884 .procname = "timeout_close",
1885 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1886 .maxlen = sizeof(int),
1888 .proc_handler = proc_dointvec_jiffies,
1891 .procname = "timeout_closewait",
1892 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1893 .maxlen = sizeof(int),
1895 .proc_handler = proc_dointvec_jiffies,
1898 .procname = "timeout_lastack",
1899 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1900 .maxlen = sizeof(int),
1902 .proc_handler = proc_dointvec_jiffies,
1905 .procname = "timeout_listen",
1906 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1907 .maxlen = sizeof(int),
1909 .proc_handler = proc_dointvec_jiffies,
1912 .procname = "timeout_synack",
1913 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1914 .maxlen = sizeof(int),
1916 .proc_handler = proc_dointvec_jiffies,
1919 .procname = "timeout_udp",
1920 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1921 .maxlen = sizeof(int),
1923 .proc_handler = proc_dointvec_jiffies,
1926 .procname = "timeout_icmp",
1927 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1928 .maxlen = sizeof(int),
1930 .proc_handler = proc_dointvec_jiffies,
1938 #ifdef CONFIG_PROC_FS
/*
 * Iterator state for the /proc/net/ip_vs seq_file walk.  The embedded
 * seq_net_private MUST stay first: seq_file_net() casts to it.
 */
1941 struct seq_net_private p; /* Do not move this, netns depends upon it*/
1942 struct list_head *table;
/*
 * Map the forwarding-method bits of a connection's flags to the short
 * name printed in /proc/net/ip_vs ("Local"/"Tunnel"/"Route"/...).
 */
1947 * Write the contents of the VS rule table to a PROCfs file.
1948 * (It is kept just for backward compatibility)
1950 static inline const char *ip_vs_fwd_name(unsigned int flags)
1952 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1953 case IP_VS_CONN_F_LOCALNODE:
1955 case IP_VS_CONN_F_TUNNEL:
1957 case IP_VS_CONN_F_DROUTE:
/*
 * Position the seq_file iterator on the pos-th service of this netns,
 * counting first through the protocol-hashed table and then the
 * fwmark-hashed one; records which table/bucket it landed in.
 */
1965 /* Get the Nth entry in the two lists */
1966 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1968 struct net *net = seq_file_net(seq);
1969 struct ip_vs_iter *iter = seq->private;
1971 struct ip_vs_service *svc;
1973 /* look in hash by protocol */
1974 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1975 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1976 if (net_eq(svc->net, net) && pos-- == 0) {
1977 iter->table = ip_vs_svc_table;
1984 /* keep looking in fwmark */
1985 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1986 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1987 if (net_eq(svc->net, net) && pos-- == 0) {
1988 iter->table = ip_vs_svc_fwm_table;
/*
 * seq_file ->start: take the service table read lock (held until
 * ->stop) and return the first entry, or SEQ_START_TOKEN for pos 0
 * so ->show can emit the header.
 */
1998 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1999 __acquires(__ip_vs_svc_lock)
2002 read_lock_bh(&__ip_vs_svc_lock);
2003 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
/*
 * seq_file ->next: advance within the current bucket, then across
 * buckets of the current table, then fall through from the protocol
 * table to the fwmark table.
 */
2007 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2009 struct list_head *e;
2010 struct ip_vs_iter *iter;
2011 struct ip_vs_service *svc;
2014 if (v == SEQ_START_TOKEN)
2015 return ip_vs_info_array(seq,0);
2018 iter = seq->private;
2020 if (iter->table == ip_vs_svc_table) {
2021 /* next service in table hashed by protocol */
2022 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
2023 return list_entry(e, struct ip_vs_service, s_list);
2026 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
2027 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
/* protocol table exhausted: switch to the fwmark table */
2033 iter->table = ip_vs_svc_fwm_table;
2038 /* next service in hashed by fwmark */
2039 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
2040 return list_entry(e, struct ip_vs_service, f_list);
2043 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
2044 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
/* seq_file ->stop: drop the read lock taken in ->start. */
2052 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
2053 __releases(__ip_vs_svc_lock)
2055 read_unlock_bh(&__ip_vs_svc_lock);
/*
 * seq_file ->show: print the legacy /proc/net/ip_vs format — a header
 * for SEQ_START_TOKEN, otherwise one service line (proto/addr/port or
 * FWM form) followed by one line per destination.
 */
2059 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
2061 if (v == SEQ_START_TOKEN) {
2063 "IP Virtual Server version %d.%d.%d (size=%d)\n",
2064 NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2066 "Prot LocalAddress:Port Scheduler Flags\n");
2068 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
2070 const struct ip_vs_service *svc = v;
2071 const struct ip_vs_iter *iter = seq->private;
2072 const struct ip_vs_dest *dest;
2074 if (iter->table == ip_vs_svc_table) {
2075 #ifdef CONFIG_IP_VS_IPV6
2076 if (svc->af == AF_INET6)
2077 seq_printf(seq, "%s [%pI6]:%04X %s ",
2078 ip_vs_proto_name(svc->protocol),
2081 svc->scheduler->name);
2084 seq_printf(seq, "%s %08X:%04X %s %s ",
2085 ip_vs_proto_name(svc->protocol),
2086 ntohl(svc->addr.ip),
2088 svc->scheduler->name,
2089 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
/* fwmark-keyed services print "FWM <mark>" instead of proto/addr */
2091 seq_printf(seq, "FWM %08X %s %s",
2092 svc->fwmark, svc->scheduler->name,
2093 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2096 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
2097 seq_printf(seq, "persistent %d %08X\n",
2099 ntohl(svc->netmask));
2101 seq_putc(seq, '\n');
2103 list_for_each_entry(dest, &svc->destinations, n_list) {
2104 #ifdef CONFIG_IP_VS_IPV6
2105 if (dest->af == AF_INET6)
2108 " %-7s %-6d %-10d %-10d\n",
2111 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2112 atomic_read(&dest->weight),
2113 atomic_read(&dest->activeconns),
2114 atomic_read(&dest->inactconns));
2119 "%-7s %-6d %-10d %-10d\n",
2120 ntohl(dest->addr.ip),
2122 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2123 atomic_read(&dest->weight),
2124 atomic_read(&dest->activeconns),
2125 atomic_read(&dest->inactconns));
/* /proc/net/ip_vs glue: seq_ops, open helper and file_operations. */
2132 static const struct seq_operations ip_vs_info_seq_ops = {
2133 .start = ip_vs_info_seq_start,
2134 .next = ip_vs_info_seq_next,
2135 .stop = ip_vs_info_seq_stop,
2136 .show = ip_vs_info_seq_show,
/* netns-aware open: allocates the ip_vs_iter as seq private data */
2139 static int ip_vs_info_open(struct inode *inode, struct file *file)
2141 return seq_open_net(inode, file, &ip_vs_info_seq_ops,
2142 sizeof(struct ip_vs_iter));
2145 static const struct file_operations ip_vs_info_fops = {
2146 .owner = THIS_MODULE,
2147 .open = ip_vs_info_open,
2149 .llseek = seq_lseek,
2150 .release = seq_release_net,
/*
 * /proc/net/ip_vs_stats: snapshot the netns totals into a userspace
 * stats struct and print absolute counters plus the estimated rates.
 */
2153 static int ip_vs_stats_show(struct seq_file *seq, void *v)
2155 struct net *net = seq_file_single_net(seq);
2156 struct ip_vs_stats_user show;
2158 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2160 " Total Incoming Outgoing Incoming Outgoing\n");
2162 " Conns Packets Packets Bytes Bytes\n");
2164 ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
2165 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns,
2166 show.inpkts, show.outpkts,
2167 (unsigned long long) show.inbytes,
2168 (unsigned long long) show.outbytes);
2170 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2172 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
2173 seq_printf(seq, "%8X %8X %8X %16X %16X\n",
2174 show.cps, show.inpps, show.outpps,
2175 show.inbps, show.outbps);
/* single_open_net wrapper and file_operations for ip_vs_stats. */
2180 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
2182 return single_open_net(inode, file, ip_vs_stats_show);
2185 static const struct file_operations ip_vs_stats_fops = {
2186 .owner = THIS_MODULE,
2187 .open = ip_vs_stats_seq_open,
2189 .llseek = seq_lseek,
2190 .release = single_release_net,
/*
 * /proc/net/ip_vs_stats_percpu: one line per possible CPU (byte
 * counters read under the u64_stats seqcount for 32-bit safety),
 * then the aggregated totals and estimated rates under tot_stats->lock.
 */
2193 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
2195 struct net *net = seq_file_single_net(seq);
2196 struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
2197 struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats;
2198 struct ip_vs_stats_user rates;
2201 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2203 " Total Incoming Outgoing Incoming Outgoing\n");
2205 "CPU Conns Packets Packets Bytes Bytes\n");
2207 for_each_possible_cpu(i) {
2208 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
2210 __u64 inbytes, outbytes;
/* retry loop: re-read if a writer updated the 64-bit counters */
2213 start = u64_stats_fetch_begin_bh(&u->syncp);
2214 inbytes = u->ustats.inbytes;
2215 outbytes = u->ustats.outbytes;
2216 } while (u64_stats_fetch_retry_bh(&u->syncp, start));
2218 seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
2219 i, u->ustats.conns, u->ustats.inpkts,
2220 u->ustats.outpkts, (__u64)inbytes,
2224 spin_lock_bh(&tot_stats->lock);
2226 seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n",
2227 tot_stats->ustats.conns, tot_stats->ustats.inpkts,
2228 tot_stats->ustats.outpkts,
2229 (unsigned long long) tot_stats->ustats.inbytes,
2230 (unsigned long long) tot_stats->ustats.outbytes);
2232 ip_vs_read_estimator(&rates, tot_stats);
2234 spin_unlock_bh(&tot_stats->lock);
2236 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2238 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
2239 seq_printf(seq, " %8X %8X %8X %16X %16X\n",
/* single_open_net wrapper and file_operations for ip_vs_stats_percpu. */
2249 static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
2251 return single_open_net(inode, file, ip_vs_stats_percpu_show);
2254 static const struct file_operations ip_vs_stats_percpu_fops = {
2255 .owner = THIS_MODULE,
2256 .open = ip_vs_stats_percpu_seq_open,
2258 .llseek = seq_lseek,
2259 .release = single_release_net,
/*
 * Apply user-supplied protocol timeouts (seconds, converted to jiffies)
 * to the per-netns TCP/UDP protocol data.  A value of 0 means "leave
 * unchanged".
 */
2264 * Set timeout values for tcp tcpfin udp in the timeout_table.
2266 static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
2268 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2269 struct ip_vs_proto_data *pd;
2272 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
2277 #ifdef CONFIG_IP_VS_PROTO_TCP
2278 if (u->tcp_timeout) {
2279 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2280 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
2281 = u->tcp_timeout * HZ;
2284 if (u->tcp_fin_timeout) {
2285 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2286 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
2287 = u->tcp_fin_timeout * HZ;
2291 #ifdef CONFIG_IP_VS_PROTO_UDP
2292 if (u->udp_timeout) {
2293 pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2294 pd->timeout_table[IP_VS_UDP_S_NORMAL]
2295 = u->udp_timeout * HZ;
/*
 * Expected argument length for each IP_VS_SO_SET_* sockopt command,
 * indexed by command offset from IP_VS_BASE_CTL; do_ip_vs_set_ctl()
 * rejects requests whose length does not match exactly.
 */
2302 #define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2303 #define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
2304 #define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
2305 sizeof(struct ip_vs_dest_user))
2306 #define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2307 #define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
2308 #define MAX_ARG_LEN SVCDEST_ARG_LEN
2310 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
2311 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
2312 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
2313 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
2314 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
2315 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
2316 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
2317 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
2318 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
2319 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
2320 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
2321 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
/*
 * Translate the legacy (IPv4-only) sockopt service struct into the
 * extended kernel-internal representation.  sched_name is aliased, not
 * copied — the source buffer must outlive usvc (it does: both live in
 * do_ip_vs_set_ctl's on-stack arg buffer).
 */
2324 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2325 struct ip_vs_service_user *usvc_compat)
2327 memset(usvc, 0, sizeof(*usvc));
2330 usvc->protocol = usvc_compat->protocol;
2331 usvc->addr.ip = usvc_compat->addr;
2332 usvc->port = usvc_compat->port;
2333 usvc->fwmark = usvc_compat->fwmark;
2335 /* Deep copy of sched_name is not needed here */
2336 usvc->sched_name = usvc_compat->sched_name;
2338 usvc->flags = usvc_compat->flags;
2339 usvc->timeout = usvc_compat->timeout;
2340 usvc->netmask = usvc_compat->netmask;
/*
 * Translate the legacy (IPv4-only) sockopt destination struct into the
 * extended kernel-internal representation.
 */
2343 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2344 struct ip_vs_dest_user *udest_compat)
2346 memset(udest, 0, sizeof(*udest));
2348 udest->addr.ip = udest_compat->addr;
2349 udest->port = udest_compat->port;
2350 udest->conn_flags = udest_compat->conn_flags;
2351 udest->weight = udest_compat->weight;
2352 udest->u_threshold = udest_compat->u_threshold;
2353 udest->l_threshold = udest_compat->l_threshold;
/*
 * setsockopt() entry point for the legacy IPVS control interface.
 * Validates capability and argument length, copies the request in,
 * then dispatches: daemon start/stop under ipvs->sync_mutex, everything
 * else under __ip_vs_mutex.  Holds a module reference for the duration.
 */
2357 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2359 struct net *net = sock_net(sk);
2361 unsigned char arg[MAX_ARG_LEN];
2362 struct ip_vs_service_user *usvc_compat;
2363 struct ip_vs_service_user_kern usvc;
2364 struct ip_vs_service *svc;
2365 struct ip_vs_dest_user *udest_compat;
2366 struct ip_vs_dest_user_kern udest;
2367 struct netns_ipvs *ipvs = net_ipvs(net);
/* only privileged users of this netns may reconfigure IPVS */
2369 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2372 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
2374 if (len < 0 || len > MAX_ARG_LEN)
/* exact-length check against the per-command table */
2376 if (len != set_arglen[SET_CMDID(cmd)]) {
2377 pr_err("set_ctl: len %u != %u\n",
2378 len, set_arglen[SET_CMDID(cmd)]);
2382 if (copy_from_user(arg, user, len) != 0)
2385 /* increase the module use count */
2386 ip_vs_use_count_inc();
2388 /* Handle daemons since they have another lock */
2389 if (cmd == IP_VS_SO_SET_STARTDAEMON ||
2390 cmd == IP_VS_SO_SET_STOPDAEMON) {
2391 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2393 if (mutex_lock_interruptible(&ipvs->sync_mutex)) {
2397 if (cmd == IP_VS_SO_SET_STARTDAEMON)
2398 ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
2401 ret = stop_sync_thread(net, dm->state);
2402 mutex_unlock(&ipvs->sync_mutex);
2406 if (mutex_lock_interruptible(&__ip_vs_mutex)) {
/* FLUSH and TIMEOUT take no service argument — handle them first */
2411 if (cmd == IP_VS_SO_SET_FLUSH) {
2412 /* Flush the virtual service */
2413 ret = ip_vs_flush(net);
2415 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2416 /* Set timeout values for (tcp tcpfin udp) */
2417 ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
/* remaining commands carry a service (and maybe a dest) argument */
2421 usvc_compat = (struct ip_vs_service_user *)arg;
2422 udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2424 /* We only use the new structs internally, so copy userspace compat
2425 * structs to extended internal versions */
2426 ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2427 ip_vs_copy_udest_compat(&udest, udest_compat);
2429 if (cmd == IP_VS_SO_SET_ZERO) {
2430 /* if no service address is set, zero counters in all */
2431 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2432 ret = ip_vs_zero_all(net);
2437 /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
2438 if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
2439 usvc.protocol != IPPROTO_SCTP) {
2440 pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n",
2441 usvc.protocol, &usvc.addr.ip,
2442 ntohs(usvc.port), usvc.sched_name);
2447 /* Lookup the exact service by <protocol, addr, port> or fwmark */
2448 if (usvc.fwmark == 0)
2449 svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
2450 &usvc.addr, usvc.port);
2452 svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
/* every command except ADD requires an existing, matching service */
2454 if (cmd != IP_VS_SO_SET_ADD
2455 && (svc == NULL || svc->protocol != usvc.protocol)) {
2461 case IP_VS_SO_SET_ADD:
2465 ret = ip_vs_add_service(net, &usvc, &svc);
2467 case IP_VS_SO_SET_EDIT:
2468 ret = ip_vs_edit_service(svc, &usvc);
2470 case IP_VS_SO_SET_DEL:
2471 ret = ip_vs_del_service(svc);
2475 case IP_VS_SO_SET_ZERO:
2476 ret = ip_vs_zero_service(svc);
2478 case IP_VS_SO_SET_ADDDEST:
2479 ret = ip_vs_add_dest(svc, &udest);
2481 case IP_VS_SO_SET_EDITDEST:
2482 ret = ip_vs_edit_dest(svc, &udest);
2484 case IP_VS_SO_SET_DELDEST:
2485 ret = ip_vs_del_dest(svc, &udest);
2492 mutex_unlock(&__ip_vs_mutex);
2494 /* decrease the module use count */
2495 ip_vs_use_count_dec();
/*
 * Fill a userspace ip_vs_service_entry from a kernel service.  IPv4
 * fields only (legacy interface); timeout is converted back to seconds.
 */
2502 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2504 dst->protocol = src->protocol;
2505 dst->addr = src->addr.ip;
2506 dst->port = src->port;
2507 dst->fwmark = src->fwmark;
2508 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2509 dst->flags = src->flags;
2510 dst->timeout = src->timeout / HZ;
2511 dst->netmask = src->netmask;
2512 dst->num_dests = src->num_dests;
2513 ip_vs_copy_stats(&dst->stats, &src->stats);
/*
 * Copy up to get->num_services IPv4 service entries of this netns to
 * userspace, walking both hash tables.  IPv6 services are skipped —
 * the legacy getsockopt interface cannot represent them.
 */
2517 __ip_vs_get_service_entries(struct net *net,
2518 const struct ip_vs_get_services *get,
2519 struct ip_vs_get_services __user *uptr)
2522 struct ip_vs_service *svc;
2523 struct ip_vs_service_entry entry;
2526 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2527 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2528 /* Only expose IPv4 entries to old interface */
2529 if (svc->af != AF_INET || !net_eq(svc->net, net))
2532 if (count >= get->num_services)
2534 memset(&entry, 0, sizeof(entry));
2535 ip_vs_copy_service(&entry, svc);
2536 if (copy_to_user(&uptr->entrytable[count],
2537 &entry, sizeof(entry))) {
2545 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2546 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2547 /* Only expose IPv4 entries to old interface */
2548 if (svc->af != AF_INET || !net_eq(svc->net, net))
2551 if (count >= get->num_services)
2553 memset(&entry, 0, sizeof(entry));
2554 ip_vs_copy_service(&entry, svc);
2555 if (copy_to_user(&uptr->entrytable[count],
2556 &entry, sizeof(entry))) {
/*
 * Copy the destination list of one service (found by fwmark or by
 * proto/addr/port) to userspace, bounded by get->num_dests.
 */
2568 __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
2569 struct ip_vs_get_dests __user *uptr)
2571 struct ip_vs_service *svc;
2572 union nf_inet_addr addr = { .ip = get->addr };
2576 svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
2578 svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
2583 struct ip_vs_dest *dest;
2584 struct ip_vs_dest_entry entry;
2586 list_for_each_entry(dest, &svc->destinations, n_list) {
2587 if (count >= get->num_dests)
2590 entry.addr = dest->addr.ip;
2591 entry.port = dest->port;
2592 entry.conn_flags = atomic_read(&dest->conn_flags);
2593 entry.weight = atomic_read(&dest->weight);
2594 entry.u_threshold = dest->u_threshold;
2595 entry.l_threshold = dest->l_threshold;
2596 entry.activeconns = atomic_read(&dest->activeconns);
2597 entry.inactconns = atomic_read(&dest->inactconns);
2598 entry.persistconns = atomic_read(&dest->persistconns);
2599 ip_vs_copy_stats(&entry.stats, &dest->stats);
2600 if (copy_to_user(&uptr->entrytable[count],
2601 &entry, sizeof(entry))) {
/*
 * Report the current TCP/TCP-FIN/UDP timeouts of this netns in
 * seconds (the kernel tables store jiffies).
 */
2613 __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
2615 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2616 struct ip_vs_proto_data *pd;
2619 memset(u, 0, sizeof (*u));
2621 #ifdef CONFIG_IP_VS_PROTO_TCP
2622 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2623 u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2624 u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2626 #ifdef CONFIG_IP_VS_PROTO_UDP
2627 pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2629 pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
/*
 * Minimum argument length for each IP_VS_SO_GET_* sockopt command;
 * do_ip_vs_get_ctl() rejects buffers shorter than the table entry.
 */
2634 #define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2635 #define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
2636 #define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
2637 #define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
2638 #define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
2639 #define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2640 #define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
2642 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2643 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2644 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2645 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2646 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2647 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2648 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2649 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
/*
 * getsockopt() entry point for the legacy IPVS control interface.
 * Capability + length checks, copy-in, then dispatch: GET_DAEMON under
 * ipvs->sync_mutex, everything else under __ip_vs_mutex.
 */
2653 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2655 unsigned char arg[128];
2657 unsigned int copylen;
2658 struct net *net = sock_net(sk);
2659 struct netns_ipvs *ipvs = net_ipvs(net);
2662 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2665 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
/* minimum-length check (GET may carry trailing variable data) */
2668 if (*len < get_arglen[GET_CMDID(cmd)]) {
2669 pr_err("get_ctl: len %u < %u\n",
2670 *len, get_arglen[GET_CMDID(cmd)]);
2674 copylen = get_arglen[GET_CMDID(cmd)];
2678 if (copy_from_user(arg, user, copylen) != 0)
2681 * Handle daemons first since it has its own locking
2683 if (cmd == IP_VS_SO_GET_DAEMON) {
2684 struct ip_vs_daemon_user d[2];
2686 memset(&d, 0, sizeof(d));
2687 if (mutex_lock_interruptible(&ipvs->sync_mutex))
2688 return -ERESTARTSYS;
/* d[0] = master daemon state, d[1] = backup daemon state */
2690 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
2691 d[0].state = IP_VS_STATE_MASTER;
2692 strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
2693 sizeof(d[0].mcast_ifn));
2694 d[0].syncid = ipvs->master_syncid;
2696 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
2697 d[1].state = IP_VS_STATE_BACKUP;
2698 strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
2699 sizeof(d[1].mcast_ifn));
2700 d[1].syncid = ipvs->backup_syncid;
2702 if (copy_to_user(user, &d, sizeof(d)) != 0)
2704 mutex_unlock(&ipvs->sync_mutex);
2708 if (mutex_lock_interruptible(&__ip_vs_mutex))
2709 return -ERESTARTSYS;
2712 case IP_VS_SO_GET_VERSION:
2716 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2717 NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2718 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2722 *len = strlen(buf)+1;
2726 case IP_VS_SO_GET_INFO:
2728 struct ip_vs_getinfo info;
2729 info.version = IP_VS_VERSION_CODE;
2730 info.size = ip_vs_conn_tab_size;
2731 info.num_services = ipvs->num_services;
2732 if (copy_to_user(user, &info, sizeof(info)) != 0)
2737 case IP_VS_SO_GET_SERVICES:
2739 struct ip_vs_get_services *get;
2742 get = (struct ip_vs_get_services *)arg;
/* caller must size the buffer for num_services trailing entries */
2743 size = sizeof(*get) +
2744 sizeof(struct ip_vs_service_entry) * get->num_services;
2746 pr_err("length: %u != %u\n", *len, size);
2750 ret = __ip_vs_get_service_entries(net, get, user);
2754 case IP_VS_SO_GET_SERVICE:
2756 struct ip_vs_service_entry *entry;
2757 struct ip_vs_service *svc;
2758 union nf_inet_addr addr;
2760 entry = (struct ip_vs_service_entry *)arg;
2761 addr.ip = entry->addr;
2763 svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
2765 svc = __ip_vs_service_find(net, AF_INET,
2766 entry->protocol, &addr,
2769 ip_vs_copy_service(entry, svc);
2770 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2777 case IP_VS_SO_GET_DESTS:
2779 struct ip_vs_get_dests *get;
2782 get = (struct ip_vs_get_dests *)arg;
2783 size = sizeof(*get) +
2784 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2786 pr_err("length: %u != %u\n", *len, size);
2790 ret = __ip_vs_get_dest_entries(net, get, user);
2794 case IP_VS_SO_GET_TIMEOUT:
2796 struct ip_vs_timeout_user t;
2798 __ip_vs_get_timeouts(net, &t);
2799 if (copy_to_user(user, &t, sizeof(t)) != 0)
2809 mutex_unlock(&__ip_vs_mutex);
/*
 * Netfilter sockopt registration binding the IP_VS_SO_{SET,GET}_*
 * command ranges to the two handlers above.
 */
2814 static struct nf_sockopt_ops ip_vs_sockopts = {
2816 .set_optmin = IP_VS_BASE_CTL,
2817 .set_optmax = IP_VS_SO_SET_MAX+1,
2818 .set = do_ip_vs_set_ctl,
2819 .get_optmin = IP_VS_BASE_CTL,
2820 .get_optmax = IP_VS_SO_GET_MAX+1,
2821 .get = do_ip_vs_get_ctl,
2822 .owner = THIS_MODULE,
/*
 * Generic netlink family for the modern IPVS control interface;
 * netnsok allows ipvsadm to operate inside network namespaces.
 */
2826 * Generic Netlink interface
2829 /* IPVS genetlink family */
2830 static struct genl_family ip_vs_genl_family = {
2831 .id = GENL_ID_GENERATE,
2833 .name = IPVS_GENL_NAME,
2834 .version = IPVS_GENL_VERSION,
2835 .maxattr = IPVS_CMD_MAX,
2836 .netnsok = true, /* Make ipvsadm to work on netns */
/*
 * Netlink attribute validation policies: top-level command attributes
 * plus the nested DAEMON, SERVICE and DEST attribute sets.
 */
2839 /* Policy used for first-level command attributes */
2840 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
2841 [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED },
2842 [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED },
2843 [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED },
2844 [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 },
2845 [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
2846 [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 },
2849 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
2850 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
2851 [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 },
2852 [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING,
2853 .len = IP_VS_IFNAME_MAXLEN },
2854 [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 },
2857 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
2858 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2859 [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 },
2860 [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 },
2861 [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY,
2862 .len = sizeof(union nf_inet_addr) },
2863 [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 },
2864 [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 },
2865 [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING,
2866 .len = IP_VS_SCHEDNAME_MAXLEN },
2867 [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING,
2868 .len = IP_VS_PENAME_MAXLEN },
2869 [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY,
2870 .len = sizeof(struct ip_vs_flags) },
2871 [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 },
2872 [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 },
2873 [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED },
2876 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
2877 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
2878 [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY,
2879 .len = sizeof(union nf_inet_addr) },
2880 [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 },
2881 [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 },
2882 [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 },
2883 [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 },
2884 [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 },
2885 [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 },
2886 [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 },
2887 [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
2888 [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
/*
 * Emit an ip_vs_stats snapshot as a nested netlink attribute; the nest
 * is cancelled (removed from the skb) on any nla_put failure.
 */
2891 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2892 struct ip_vs_stats *stats)
2894 struct ip_vs_stats_user ustats;
2895 struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2899 ip_vs_copy_stats(&ustats, stats);
2901 if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) ||
2902 nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) ||
2903 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) ||
2904 nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) ||
2905 nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) ||
2906 nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) ||
2907 nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) ||
2908 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) ||
2909 nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) ||
2910 nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps))
2911 goto nla_put_failure;
2912 nla_nest_end(skb, nl_stats);
2917 nla_nest_cancel(skb, nl_stats);
/* Serialize one virtual service into @skb as a nested
 * IPVS_CMD_ATTR_SERVICE attribute: address family, then either the
 * fwmark or the protocol/addr/port triple, then scheduler name,
 * optional PE name, flags, timeout, netmask and stats.
 * On any nla_put failure the whole nest is cancelled.
 */
2921 static int ip_vs_genl_fill_service(struct sk_buff *skb,
2922 struct ip_vs_service *svc)
2924 struct nlattr *nl_service;
2925 struct ip_vs_flags flags = { .flags = svc->flags,
2928 nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
2932 if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
2933 goto nla_put_failure;
/* fwmark-based and addr/port-based services are mutually exclusive;
 * the selecting if/else lines are elided in this excerpt */
2935 if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
2936 goto nla_put_failure;
2938 if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
2939 nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
2940 nla_put_u16(skb, IPVS_SVC_ATTR_PORT, svc->port))
2941 goto nla_put_failure;
2944 if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name) ||
2946 nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name)) ||
2947 nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
2948 nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
2949 nla_put_u32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
2950 goto nla_put_failure;
2951 if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
2952 goto nla_put_failure;
2954 nla_nest_end(skb, nl_service);
2959 nla_nest_cancel(skb, nl_service);
/* Dump one service as a complete NLM_F_MULTI genetlink message for a
 * dump callback: open the genlmsg header, fill the service nest, close.
 * On failure the whole message is cancelled so the dump can resume.
 */
2963 static int ip_vs_genl_dump_service(struct sk_buff *skb,
2964 struct ip_vs_service *svc,
2965 struct netlink_callback *cb)
2969 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
2970 &ip_vs_genl_family, NLM_F_MULTI,
2971 IPVS_CMD_NEW_SERVICE);
2975 if (ip_vs_genl_fill_service(skb, svc) < 0)
2976 goto nla_put_failure;
2978 return genlmsg_end(skb, hdr);
2981 genlmsg_cancel(skb, hdr);
/* Netlink dump callback for IPVS_CMD_GET_SERVICE: walk both hash
 * tables (addr/port-keyed then fwmark-keyed) under __ip_vs_mutex,
 * skipping entries already sent (cb->args[0]) and entries belonging
 * to other network namespaces.  Resumption state is kept in cb->args.
 */
2985 static int ip_vs_genl_dump_services(struct sk_buff *skb,
2986 struct netlink_callback *cb)
2989 int start = cb->args[0];
2990 struct ip_vs_service *svc;
2991 struct net *net = skb_sknet(skb);
2993 mutex_lock(&__ip_vs_mutex);
2994 for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2995 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
/* skip already-dumped entries and foreign-netns services */
2996 if (++idx <= start || !net_eq(svc->net, net))
2998 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
3000 goto nla_put_failure;
/* second pass: fwmark-keyed services */
3005 for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
3006 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
3007 if (++idx <= start || !net_eq(svc->net, net))
3009 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
3011 goto nla_put_failure;
3017 mutex_unlock(&__ip_vs_mutex);
/* Parse a nested IPVS_CMD_ATTR_SERVICE attribute into @usvc and look
 * up any matching existing service (returned via @ret_svc).
 *
 * Mandatory identification: AF plus either a fwmark, or the full
 * protocol/addr/port triple.  When @full_entry is set (service
 * add/edit), also require scheduler name, flags, timeout and netmask;
 * the PE name stays optional.  Returns 0 or a negative errno
 * (-EAFNOSUPPORT for families this build cannot serve).
 */
3023 static int ip_vs_genl_parse_service(struct net *net,
3024 struct ip_vs_service_user_kern *usvc,
3025 struct nlattr *nla, int full_entry,
3026 struct ip_vs_service **ret_svc)
3028 struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
3029 struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
3030 struct ip_vs_service *svc;
3032 /* Parse mandatory identifying service fields first */
3034 nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
3037 nla_af = attrs[IPVS_SVC_ATTR_AF];
3038 nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL];
3039 nla_addr = attrs[IPVS_SVC_ATTR_ADDR];
3040 nla_port = attrs[IPVS_SVC_ATTR_PORT];
3041 nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK];
/* AF is always needed; beyond that either fwmark or the full triple */
3043 if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
3046 memset(usvc, 0, sizeof(*usvc));
3048 usvc->af = nla_get_u16(nla_af);
3049 #ifdef CONFIG_IP_VS_IPV6
3050 if (usvc->af != AF_INET && usvc->af != AF_INET6)
3052 if (usvc->af != AF_INET)
3054 return -EAFNOSUPPORT;
/* fwmark services carry a nominal TCP protocol */
3057 usvc->protocol = IPPROTO_TCP;
3058 usvc->fwmark = nla_get_u32(nla_fwmark);
3060 usvc->protocol = nla_get_u16(nla_protocol);
3061 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
3062 usvc->port = nla_get_u16(nla_port);
/* look up a possibly pre-existing service by fwmark or by triple */
3067 svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
3069 svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
3070 &usvc->addr, usvc->port);
3073 /* If a full entry was requested, check for the additional fields */
3075 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
3077 struct ip_vs_flags flags;
3079 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
3080 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
3081 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
3082 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
3083 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
3085 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
3088 nla_memcpy(&flags, nla_flags, sizeof(flags));
3090 /* prefill flags from service if it already exists */
3092 usvc->flags = svc->flags;
3094 /* set new flags from userland */
3095 usvc->flags = (usvc->flags & ~flags.mask) |
3096 (flags.flags & flags.mask);
3097 usvc->sched_name = nla_data(nla_sched);
3098 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
3099 usvc->timeout = nla_get_u32(nla_timeout);
3100 usvc->netmask = nla_get_u32(nla_netmask);
/* Convenience wrapper: parse a service attribute in identify-only
 * mode (full_entry == 0) and return the matching service, or an
 * ERR_PTR on parse failure.
 */
3106 static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
3109 struct ip_vs_service_user_kern usvc;
3110 struct ip_vs_service *svc;
3113 ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
3114 return ret ? ERR_PTR(ret) : svc;
/* Serialize one real server into @skb as a nested IPVS_CMD_ATTR_DEST
 * attribute: address/port, forwarding method (masked from conn_flags),
 * weight, thresholds, the three connection counters and its stats.
 * Cancels the nest on any nla_put failure.
 */
3117 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
3119 struct nlattr *nl_dest;
3121 nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
3125 if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
3126 nla_put_u16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
3127 nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
3128 (atomic_read(&dest->conn_flags) &
3129 IP_VS_CONN_F_FWD_MASK)) ||
3130 nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
3131 atomic_read(&dest->weight)) ||
3132 nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
3133 nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
3134 nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
3135 atomic_read(&dest->activeconns)) ||
3136 nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
3137 atomic_read(&dest->inactconns)) ||
3138 nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
3139 atomic_read(&dest->persistconns)))
3140 goto nla_put_failure;
3141 if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
3142 goto nla_put_failure;
3144 nla_nest_end(skb, nl_dest);
3149 nla_nest_cancel(skb, nl_dest);
/* Dump one destination as an NLM_F_MULTI genetlink message;
 * mirror of ip_vs_genl_dump_service() for real servers.
 */
3153 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
3154 struct netlink_callback *cb)
3158 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3159 &ip_vs_genl_family, NLM_F_MULTI,
3164 if (ip_vs_genl_fill_dest(skb, dest) < 0)
3165 goto nla_put_failure;
3167 return genlmsg_end(skb, hdr);
3170 genlmsg_cancel(skb, hdr);
/* Netlink dump callback for IPVS_CMD_GET_DEST: re-parse the service
 * attribute from the request header on every invocation, find the
 * service, then walk its destination list resuming at cb->args[0].
 * Everything happens under __ip_vs_mutex.
 */
3174 static int ip_vs_genl_dump_dests(struct sk_buff *skb,
3175 struct netlink_callback *cb)
3178 int start = cb->args[0];
3179 struct ip_vs_service *svc;
3180 struct ip_vs_dest *dest;
3181 struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
3182 struct net *net = skb_sknet(skb);
3184 mutex_lock(&__ip_vs_mutex);
3186 /* Try to find the service for which to dump destinations */
3187 if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
3188 IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
3192 svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
3193 if (IS_ERR(svc) || svc == NULL)
3196 /* Dump the destinations */
3197 list_for_each_entry(dest, &svc->destinations, n_list) {
3200 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
3202 goto nla_put_failure;
3210 mutex_unlock(&__ip_vs_mutex);
/* Parse a nested IPVS_CMD_ATTR_DEST attribute into @udest.
 * Address and port are always mandatory; when @full_entry is set
 * (dest add/edit) forwarding method, weight and both thresholds are
 * required as well.  Returns 0 or a negative errno.
 */
3215 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
3216 struct nlattr *nla, int full_entry)
3218 struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
3219 struct nlattr *nla_addr, *nla_port;
3221 /* Parse mandatory identifying destination fields first */
3223 nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
3226 nla_addr = attrs[IPVS_DEST_ATTR_ADDR];
3227 nla_port = attrs[IPVS_DEST_ATTR_PORT];
3229 if (!(nla_addr && nla_port))
3232 memset(udest, 0, sizeof(*udest));
3234 nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
3235 udest->port = nla_get_u16(nla_port);
3237 /* If a full entry was requested, check for the additional fields */
3239 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
3242 nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
3243 nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
3244 nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
3245 nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
3247 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
/* only the forwarding-method bits are taken from userspace */
3250 udest->conn_flags = nla_get_u32(nla_fwd)
3251 & IP_VS_CONN_F_FWD_MASK;
3252 udest->weight = nla_get_u32(nla_weight);
3253 udest->u_threshold = nla_get_u32(nla_u_thresh);
3254 udest->l_threshold = nla_get_u32(nla_l_thresh);
/* Emit one sync-daemon description (state, multicast interface name
 * and sync id) into @skb as a nested IPVS_CMD_ATTR_DAEMON attribute.
 */
3260 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
3261 const char *mcast_ifn, __be32 syncid)
3263 struct nlattr *nl_daemon;
3265 nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
3269 if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
3270 nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) ||
3271 nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid))
3272 goto nla_put_failure;
3273 nla_nest_end(skb, nl_daemon);
3278 nla_nest_cancel(skb, nl_daemon);
/* Dump one sync daemon as an NLM_F_MULTI IPVS_CMD_NEW_DAEMON
 * message; cancels the message if the daemon nest cannot be filled.
 */
3282 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
3283 const char *mcast_ifn, __be32 syncid,
3284 struct netlink_callback *cb)
3287 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3288 &ip_vs_genl_family, NLM_F_MULTI,
3289 IPVS_CMD_NEW_DAEMON);
3293 if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
3294 goto nla_put_failure;
3296 return genlmsg_end(skb, hdr);
3299 genlmsg_cancel(skb, hdr);
/* Netlink dump callback for IPVS_CMD_GET_DAEMON: at most two entries
 * exist (master and backup sync daemon).  cb->args[0]/[1] record
 * which of the two have already been dumped.  Runs under the per-netns
 * sync_mutex, not __ip_vs_mutex.
 */
3303 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
3304 struct netlink_callback *cb)
3306 struct net *net = skb_sknet(skb);
3307 struct netns_ipvs *ipvs = net_ipvs(net);
3309 mutex_lock(&ipvs->sync_mutex);
3310 if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
3311 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
3312 ipvs->master_mcast_ifn,
3313 ipvs->master_syncid, cb) < 0)
3314 goto nla_put_failure;
3319 if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
3320 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
3321 ipvs->backup_mcast_ifn,
3322 ipvs->backup_syncid, cb) < 0)
3323 goto nla_put_failure;
3329 mutex_unlock(&ipvs->sync_mutex);
/* IPVS_CMD_NEW_DAEMON handler body: all three daemon attributes are
 * mandatory; on success starts a master or backup sync thread.
 */
3334 static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
3336 if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3337 attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3338 attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3341 return start_sync_thread(net,
3342 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3343 nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3344 nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
/* IPVS_CMD_DEL_DAEMON handler body: only the state attribute is
 * needed to pick which sync thread (master/backup) to stop.
 */
3347 static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
3349 if (!attrs[IPVS_DAEMON_ATTR_STATE])
3352 return stop_sync_thread(net,
3353 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
/* IPVS_CMD_SET_CONFIG handler: read the current protocol timeouts,
 * overlay whichever of the three timeout attributes were supplied,
 * and write the result back.  Attributes left out keep their value.
 */
3356 static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
3358 struct ip_vs_timeout_user t;
3360 __ip_vs_get_timeouts(net, &t);
3362 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3363 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
3365 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
3367 nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
3369 if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3370 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3372 return ip_vs_set_timeout(net, &t);
/* doit handler for the NEW_DAEMON / DEL_DAEMON commands: re-parse the
 * nested daemon attribute under the per-netns sync_mutex, then
 * dispatch to new/del.  Uses sync_mutex (not __ip_vs_mutex) because
 * sync-thread start/stop may sleep for long periods.
 */
3375 static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
3379 struct netns_ipvs *ipvs;
3381 net = skb_sknet(skb);
3382 ipvs = net_ipvs(net);
3383 cmd = info->genlhdr->cmd;
3385 if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
3386 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
3388 mutex_lock(&ipvs->sync_mutex);
3389 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
3390 nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
3391 info->attrs[IPVS_CMD_ATTR_DAEMON],
3392 ip_vs_daemon_policy)) {
3397 if (cmd == IPVS_CMD_NEW_DAEMON)
3398 ret = ip_vs_genl_new_daemon(net, daemon_attrs);
3400 ret = ip_vs_genl_del_daemon(net, daemon_attrs);
3402 mutex_unlock(&ipvs->sync_mutex);
/* Central doit handler for all state-changing IPVS genetlink
 * commands (service/dest add/edit/del, zero, flush, set-config).
 * Runs entirely under __ip_vs_mutex.  FLUSH, SET_CONFIG and
 * attribute-less ZERO are handled up front; the remaining commands
 * all require a service argument, parsed in "full" mode only for
 * NEW/SET_SERVICE, then dispatch via the switch at the bottom.
 */
3407 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3409 struct ip_vs_service *svc = NULL;
3410 struct ip_vs_service_user_kern usvc;
3411 struct ip_vs_dest_user_kern udest;
3413 int need_full_svc = 0, need_full_dest = 0;
3416 net = skb_sknet(skb);
3417 cmd = info->genlhdr->cmd;
3419 mutex_lock(&__ip_vs_mutex);
3421 if (cmd == IPVS_CMD_FLUSH) {
3422 ret = ip_vs_flush(net);
3424 } else if (cmd == IPVS_CMD_SET_CONFIG) {
3425 ret = ip_vs_genl_set_config(net, info->attrs);
3427 } else if (cmd == IPVS_CMD_ZERO &&
3428 !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3429 ret = ip_vs_zero_all(net);
3433 /* All following commands require a service argument, so check if we
3434 * received a valid one. We need a full service specification when
3435 * adding / editing a service. Only identifying members otherwise. */
3436 if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
3439 ret = ip_vs_genl_parse_service(net, &usvc,
3440 info->attrs[IPVS_CMD_ATTR_SERVICE],
3441 need_full_svc, &svc)
3445 /* Unless we're adding a new service, the service must already exist */
3446 if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
3451 /* Destination commands require a valid destination argument. For
3452 * adding / editing a destination, we need a full destination
3454 if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
3455 cmd == IPVS_CMD_DEL_DEST) {
3456 if (cmd != IPVS_CMD_DEL_DEST)
3459 ret = ip_vs_genl_parse_dest(&udest,
3460 info->attrs[IPVS_CMD_ATTR_DEST],
3467 case IPVS_CMD_NEW_SERVICE:
3469 ret = ip_vs_add_service(net, &usvc, &svc);
3473 case IPVS_CMD_SET_SERVICE:
3474 ret = ip_vs_edit_service(svc, &usvc);
3476 case IPVS_CMD_DEL_SERVICE:
3477 ret = ip_vs_del_service(svc);
3478 /* do not use svc, it can be freed */
3480 case IPVS_CMD_NEW_DEST:
3481 ret = ip_vs_add_dest(svc, &udest);
3483 case IPVS_CMD_SET_DEST:
3484 ret = ip_vs_edit_dest(svc, &udest);
3486 case IPVS_CMD_DEL_DEST:
3487 ret = ip_vs_del_dest(svc, &udest);
/* default/ZERO-with-service case: zero a single service's counters */
3490 ret = ip_vs_zero_service(svc);
3497 mutex_unlock(&__ip_vs_mutex);
/* doit handler for the non-dump GET commands (GET_SERVICE, GET_INFO,
 * GET_CONFIG): allocate a reply skb, fill it according to the command
 * under __ip_vs_mutex, and send it back with genlmsg_reply().
 */
3502 static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3504 struct sk_buff *msg;
3506 int ret, cmd, reply_cmd;
3509 net = skb_sknet(skb);
3510 cmd = info->genlhdr->cmd;
/* map each GET command to the command id used in the reply */
3512 if (cmd == IPVS_CMD_GET_SERVICE)
3513 reply_cmd = IPVS_CMD_NEW_SERVICE;
3514 else if (cmd == IPVS_CMD_GET_INFO)
3515 reply_cmd = IPVS_CMD_SET_INFO;
3516 else if (cmd == IPVS_CMD_GET_CONFIG)
3517 reply_cmd = IPVS_CMD_SET_CONFIG;
3519 pr_err("unknown Generic Netlink command\n");
3523 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3527 mutex_lock(&__ip_vs_mutex);
3529 reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
3531 goto nla_put_failure;
3534 case IPVS_CMD_GET_SERVICE:
3536 struct ip_vs_service *svc;
3538 svc = ip_vs_genl_find_service(net,
3539 info->attrs[IPVS_CMD_ATTR_SERVICE]);
3544 ret = ip_vs_genl_fill_service(msg, svc);
3546 goto nla_put_failure;
3555 case IPVS_CMD_GET_CONFIG:
3557 struct ip_vs_timeout_user t;
3559 __ip_vs_get_timeouts(net, &t);
3560 #ifdef CONFIG_IP_VS_PROTO_TCP
3561 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
3563 nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
3565 goto nla_put_failure;
3567 #ifdef CONFIG_IP_VS_PROTO_UDP
3568 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
3569 goto nla_put_failure;
3575 case IPVS_CMD_GET_INFO:
3576 if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
3577 IP_VS_VERSION_CODE) ||
3578 nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
3579 ip_vs_conn_tab_size))
3580 goto nla_put_failure;
3584 genlmsg_end(msg, reply);
3585 ret = genlmsg_reply(msg, info);
/* nla_put_failure: reply skb too small for the payload */
3589 pr_err("not enough space in Netlink message\n");
3595 mutex_unlock(&__ip_vs_mutex);
/* Command table for the IPVS generic netlink family.  Every command
 * requires CAP_NET_ADMIN (GENL_ADMIN_PERM).  State-changing commands
 * share ip_vs_genl_set_cmd (daemons: ip_vs_genl_set_daemon); GET
 * commands use either a doit (single reply) or a dumpit (multi-part).
 */
3601 static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
3603 .cmd = IPVS_CMD_NEW_SERVICE,
3604 .flags = GENL_ADMIN_PERM,
3605 .policy = ip_vs_cmd_policy,
3606 .doit = ip_vs_genl_set_cmd,
3609 .cmd = IPVS_CMD_SET_SERVICE,
3610 .flags = GENL_ADMIN_PERM,
3611 .policy = ip_vs_cmd_policy,
3612 .doit = ip_vs_genl_set_cmd,
3615 .cmd = IPVS_CMD_DEL_SERVICE,
3616 .flags = GENL_ADMIN_PERM,
3617 .policy = ip_vs_cmd_policy,
3618 .doit = ip_vs_genl_set_cmd,
3621 .cmd = IPVS_CMD_GET_SERVICE,
3622 .flags = GENL_ADMIN_PERM,
3623 .doit = ip_vs_genl_get_cmd,
3624 .dumpit = ip_vs_genl_dump_services,
3625 .policy = ip_vs_cmd_policy,
3628 .cmd = IPVS_CMD_NEW_DEST,
3629 .flags = GENL_ADMIN_PERM,
3630 .policy = ip_vs_cmd_policy,
3631 .doit = ip_vs_genl_set_cmd,
3634 .cmd = IPVS_CMD_SET_DEST,
3635 .flags = GENL_ADMIN_PERM,
3636 .policy = ip_vs_cmd_policy,
3637 .doit = ip_vs_genl_set_cmd,
3640 .cmd = IPVS_CMD_DEL_DEST,
3641 .flags = GENL_ADMIN_PERM,
3642 .policy = ip_vs_cmd_policy,
3643 .doit = ip_vs_genl_set_cmd,
3646 .cmd = IPVS_CMD_GET_DEST,
3647 .flags = GENL_ADMIN_PERM,
3648 .policy = ip_vs_cmd_policy,
3649 .dumpit = ip_vs_genl_dump_dests,
3652 .cmd = IPVS_CMD_NEW_DAEMON,
3653 .flags = GENL_ADMIN_PERM,
3654 .policy = ip_vs_cmd_policy,
3655 .doit = ip_vs_genl_set_daemon,
3658 .cmd = IPVS_CMD_DEL_DAEMON,
3659 .flags = GENL_ADMIN_PERM,
3660 .policy = ip_vs_cmd_policy,
3661 .doit = ip_vs_genl_set_daemon,
3664 .cmd = IPVS_CMD_GET_DAEMON,
3665 .flags = GENL_ADMIN_PERM,
3666 .dumpit = ip_vs_genl_dump_daemons,
3669 .cmd = IPVS_CMD_SET_CONFIG,
3670 .flags = GENL_ADMIN_PERM,
3671 .policy = ip_vs_cmd_policy,
3672 .doit = ip_vs_genl_set_cmd,
3675 .cmd = IPVS_CMD_GET_CONFIG,
3676 .flags = GENL_ADMIN_PERM,
3677 .doit = ip_vs_genl_get_cmd,
3680 .cmd = IPVS_CMD_GET_INFO,
3681 .flags = GENL_ADMIN_PERM,
3682 .doit = ip_vs_genl_get_cmd,
3685 .cmd = IPVS_CMD_ZERO,
3686 .flags = GENL_ADMIN_PERM,
3687 .policy = ip_vs_cmd_policy,
3688 .doit = ip_vs_genl_set_cmd,
3691 .cmd = IPVS_CMD_FLUSH,
3692 .flags = GENL_ADMIN_PERM,
3693 .doit = ip_vs_genl_set_cmd,
/* Register the IPVS genetlink family together with its ops table. */
3697 static int __init ip_vs_genl_register(void)
3699 return genl_register_family_with_ops(&ip_vs_genl_family,
3700 ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
/* Unregister the IPVS genetlink family (ops go with the family). */
3703 static void ip_vs_genl_unregister(void)
3705 genl_unregister_family(&ip_vs_genl_family);
3708 /* End of Generic Netlink interface definitions */
3711 * per netns init/exit func.
3713 #ifdef CONFIG_SYSCTL
/* Per-netns sysctl setup: initialize defense locks, duplicate the
 * vs_vars template for non-init netns (hidden from unprivileged user
 * namespaces), point each table slot at the per-netns field while
 * seeding defaults, register the table under net/ipv4/vs, start the
 * global stats estimator and schedule the periodic defense work.
 * NOTE(review): error-unwind lines are elided in this excerpt.
 */
3714 static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
3717 struct netns_ipvs *ipvs = net_ipvs(net);
3718 struct ctl_table *tbl;
3720 atomic_set(&ipvs->dropentry, 0);
3721 spin_lock_init(&ipvs->dropentry_lock);
3722 spin_lock_init(&ipvs->droppacket_lock);
3723 spin_lock_init(&ipvs->securetcp_lock);
3725 if (!net_eq(net, &init_net)) {
3726 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
3730 /* Don't export sysctls to unprivileged users */
3731 if (net->user_ns != &init_user_ns)
3732 tbl[0].procname = NULL;
3735 /* Initialize sysctl defaults */
/* idx order must match the vs_vars template order exactly */
3737 ipvs->sysctl_amemthresh = 1024;
3738 tbl[idx++].data = &ipvs->sysctl_amemthresh;
3739 ipvs->sysctl_am_droprate = 10;
3740 tbl[idx++].data = &ipvs->sysctl_am_droprate;
3741 tbl[idx++].data = &ipvs->sysctl_drop_entry;
3742 tbl[idx++].data = &ipvs->sysctl_drop_packet;
3743 #ifdef CONFIG_IP_VS_NFCT
3744 tbl[idx++].data = &ipvs->sysctl_conntrack;
3746 tbl[idx++].data = &ipvs->sysctl_secure_tcp;
3747 ipvs->sysctl_snat_reroute = 1;
3748 tbl[idx++].data = &ipvs->sysctl_snat_reroute;
3749 ipvs->sysctl_sync_ver = 1;
3750 tbl[idx++].data = &ipvs->sysctl_sync_ver;
3751 ipvs->sysctl_sync_ports = 1;
3752 tbl[idx++].data = &ipvs->sysctl_sync_ports;
/* cap sync queue by available memory, not a fixed constant */
3753 ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
3754 tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
3755 ipvs->sysctl_sync_sock_size = 0;
3756 tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
3757 tbl[idx++].data = &ipvs->sysctl_cache_bypass;
3758 tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
3759 tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
3760 ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
3761 ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
3762 tbl[idx].data = &ipvs->sysctl_sync_threshold;
3763 tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
3764 ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
3765 tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
3766 ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
3767 tbl[idx++].data = &ipvs->sysctl_sync_retries;
3768 tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
3769 ipvs->sysctl_pmtu_disc = 1;
3770 tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
3771 tbl[idx++].data = &ipvs->sysctl_backup_only;
3774 ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
3775 if (ipvs->sysctl_hdr == NULL) {
/* only free the table if we kmemdup'ed it above */
3776 if (!net_eq(net, &init_net))
3780 ip_vs_start_estimator(net, &ipvs->tot_stats);
3781 ipvs->sysctl_tbl = tbl;
3782 /* Schedule defense work */
3783 INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
3784 schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
/* Per-netns sysctl teardown: stop the defense work (both the delayed
 * timer and any in-flight execution) before unregistering the table.
 */
3789 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
3791 struct netns_ipvs *ipvs = net_ipvs(net);
3793 cancel_delayed_work_sync(&ipvs->defense_work);
3794 cancel_work_sync(&ipvs->defense_work.work);
3795 unregister_net_sysctl_table(ipvs->sysctl_hdr);
/* !CONFIG_SYSCTL stubs: keep callers unconditional. */
3800 static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; }
3801 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { }
/* Netdevice notifier: ip_vs_dst_event() reacts to device events
 * (registered in ip_vs_control_init() below). */
3805 static struct notifier_block ip_vs_dst_notifier = {
3806 .notifier_call = ip_vs_dst_event,
/* Per-netns control init: set up the real-server hash table, trash
 * list and service counters, allocate per-cpu totals, create the
 * /proc/net/ip_vs* entries and run the sysctl init.  The per-cpu
 * stats are freed on the (elided) error path if sysctl init fails.
 */
3809 int __net_init ip_vs_control_net_init(struct net *net)
3812 struct netns_ipvs *ipvs = net_ipvs(net);
3814 rwlock_init(&ipvs->rs_lock);
3816 /* Initialize rs_table */
3817 for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
3818 INIT_LIST_HEAD(&ipvs->rs_table[idx]);
3820 INIT_LIST_HEAD(&ipvs->dest_trash);
3821 atomic_set(&ipvs->ftpsvc_counter, 0);
3822 atomic_set(&ipvs->nullsvc_counter, 0);
3825 ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
3826 if (!ipvs->tot_stats.cpustats)
3829 spin_lock_init(&ipvs->tot_stats.lock);
3831 proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
3832 proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
3833 proc_create("ip_vs_stats_percpu", 0, net->proc_net,
3834 &ip_vs_stats_percpu_fops);
3836 if (ip_vs_control_net_init_sysctl(net))
/* error unwind: release the per-cpu stats allocated above */
3842 free_percpu(ipvs->tot_stats.cpustats);
/* Per-netns control teardown, in reverse order of init: drain the
 * dest trash, stop the estimator, undo sysctl, remove the proc
 * entries and finally free the per-cpu totals.
 */
3846 void __net_exit ip_vs_control_net_cleanup(struct net *net)
3848 struct netns_ipvs *ipvs = net_ipvs(net);
3850 ip_vs_trash_cleanup(net);
3851 ip_vs_stop_estimator(net, &ipvs->tot_stats);
3852 ip_vs_control_net_cleanup_sysctl(net);
3853 remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
3854 remove_proc_entry("ip_vs_stats", net->proc_net);
3855 remove_proc_entry("ip_vs", net->proc_net);
3856 free_percpu(ipvs->tot_stats.cpustats);
/* Register both userspace interfaces: the legacy [gs]etsockopt hooks
 * and the genetlink family.  On genetlink failure the sockopt
 * registration is rolled back (unwind label elided in this excerpt).
 */
3859 int __init ip_vs_register_nl_ioctl(void)
3863 ret = nf_register_sockopt(&ip_vs_sockopts);
3865 pr_err("cannot register sockopt.\n");
3869 ret = ip_vs_genl_register();
3871 pr_err("cannot register Generic Netlink interface.\n");
3877 nf_unregister_sockopt(&ip_vs_sockopts);
/* Tear down both userspace interfaces, reverse of registration. */
3882 void ip_vs_unregister_nl_ioctl(void)
3884 ip_vs_genl_unregister();
3885 nf_unregister_sockopt(&ip_vs_sockopts);
/* Module-wide control init: empty both global service hash tables and
 * hook the netdevice notifier used to react to device events.
 */
3888 int __init ip_vs_control_init(void)
3895 /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
3896 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
3897 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
3898 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
3901 smp_wmb(); /* Do we really need it now ? */
3903 ret = register_netdevice_notifier(&ip_vs_dst_notifier);
3912 void ip_vs_control_cleanup(void)
3915 unregister_netdevice_notifier(&ip_vs_dst_notifier);