2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
57 #include <net/dst_metadata.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
66 #include <asm/uaccess.h>
69 #include <linux/sysctl.h>
/* Fragment of enum rt6_nud_state: verdicts from neighbour-reachability
 * checks used when scoring candidate routes.  Negative values are degrees
 * of failure consumed by find_match()/rt6_score_route():
 *   FAIL_HARD  - route unusable, skip it entirely.
 *   FAIL_PROBE - neighbour unresolved; probe it but treat score as weak.
 *   FAIL_DO_RR - trigger round-robin to the next sibling route.
 * (Enum opening and any SUCCEED value lie outside this view.)
 */
73 RT6_NUD_FAIL_HARD = -3,
74 RT6_NUD_FAIL_PROBE = -2,
75 RT6_NUD_FAIL_DO_RR = -1,
/* Forward declarations for the dst_ops callbacks installed in
 * ip6_dst_ops_template / ip6_dst_blackhole_ops below, plus the
 * RFC 4191 route-information helpers (CONFIG_IPV6_ROUTE_INFO).
 */
79 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
80 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
81 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
82 static unsigned int ip6_mtu(const struct dst_entry *dst);
83 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
84 static void ip6_dst_destroy(struct dst_entry *);
85 static void ip6_dst_ifdown(struct dst_entry *,
86 struct net_device *dev, int how);
87 static int ip6_dst_gc(struct dst_ops *ops);
89 static int ip6_pkt_discard(struct sk_buff *skb);
90 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
91 static int ip6_pkt_prohibit(struct sk_buff *skb);
92 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static void ip6_link_failure(struct sk_buff *skb);
94 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
95 struct sk_buff *skb, u32 mtu);
96 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
99 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct net_device *dev,
103 const struct in6_addr *prefix, int prefixlen,
104 const struct in6_addr *gwaddr, unsigned int pref);
105 static struct rt6_info *rt6_get_route_info(struct net_device *dev,
106 const struct in6_addr *prefix, int prefixlen,
107 const struct in6_addr *gwaddr);
/* Per-CPU list of "uncached" rt6_info entries (DST_NOCACHE routes that live
 * outside the fib6 tree).  Tracked so device-unregister can re-point them at
 * the loopback device; see rt6_uncached_list_flush_dev().
 */
110 struct uncached_list {
112 struct list_head head;
115 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Add @rt to this CPU's uncached list, marking it DST_NOCACHE so that
 * ip6_dst_check() forces revalidation.  The list spinlock is taken _bh
 * because the list is also walked from softirq context.
 */
117 static void rt6_uncached_list_add(struct rt6_info *rt)
119 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
121 rt->dst.flags |= DST_NOCACHE;
122 rt->rt6i_uncached_list = ul;
124 spin_lock_bh(&ul->lock);
125 list_add_tail(&rt->rt6i_uncached, &ul->head);
126 spin_unlock_bh(&ul->lock);
/* Unlink @rt from the uncached list it was added to (if any).  Safe to call
 * on routes that were never added: list_empty() on the INIT_LIST_HEAD'ed
 * rt6i_uncached makes this a no-op.
 */
129 static void rt6_uncached_list_del(struct rt6_info *rt)
131 if (!list_empty(&rt->rt6i_uncached)) {
132 struct uncached_list *ul = rt->rt6i_uncached_list;
134 spin_lock_bh(&ul->lock);
135 list_del(&rt->rt6i_uncached);
136 spin_unlock_bh(&ul->lock);
/* Device teardown: walk every CPU's uncached list and migrate any route
 * still referencing @dev onto the namespace's loopback device, so the
 * outgoing device can be unregistered without dangling references.
 * Nothing to do when @dev is loopback itself (it never goes away before
 * the namespace does).
 */
140 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
142 struct net_device *loopback_dev = net->loopback_dev;
145 if (dev == loopback_dev)
148 for_each_possible_cpu(cpu) {
149 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
152 spin_lock_bh(&ul->lock);
153 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
154 struct inet6_dev *rt_idev = rt->rt6i_idev;
155 struct net_device *rt_dev = rt->dst.dev;
/* Swap the idev reference: take loopback's, drop the dying device's. */
157 if (rt_idev->dev == dev) {
158 rt->rt6i_idev = in6_dev_get(loopback_dev);
159 in6_dev_put(rt_idev);
/* rt_dev handling for dst.dev occurs in elided lines; here the new
 * loopback dev reference is installed and pinned with dev_hold().
 */
163 rt->dst.dev = loopback_dev;
164 dev_hold(rt->dst.dev);
168 spin_unlock_bh(&ul->lock);
/* For a per-cpu clone (RTF_PCPU) the metrics live in the parent route that
 * dst.from points at; return a writable pointer into that parent.
 */
172 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
174 return dst_metrics_write_ptr(rt->dst.from);
/* dst_ops->cow_metrics callback: pick the metrics copy-on-write strategy
 * by route type.  PCPU clones share the parent's metrics; RTF_CACHE routes
 * take the branch elided here; everything else falls back to the generic
 * dst metrics COW.
 */
177 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
179 struct rt6_info *rt = (struct rt6_info *)dst;
181 if (rt->rt6i_flags & RTF_PCPU)
182 return rt6_pcpu_cow_metrics(rt);
183 else if (rt->rt6i_flags & RTF_CACHE)
186 return dst_cow_metrics_generic(dst, old);
/* Pick the address used for the neighbour lookup: the route's gateway when
 * one is set, otherwise fall back to the packet's destination address.
 */
189 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
193 struct in6_addr *p = &rt->rt6i_gateway;
195 if (!ipv6_addr_any(p))
196 return (const void *) p;
198 return &ipv6_hdr(skb)->daddr;
/* dst_ops->neigh_lookup callback: resolve (or create) the ndisc neighbour
 * entry for this route's next hop on dst->dev.
 */
202 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
206 struct rt6_info *rt = (struct rt6_info *) dst;
209 daddr = choose_neigh_daddr(rt, skb, daddr);
210 n = __ipv6_neigh_lookup(dst->dev, daddr);
/* No existing entry (branch elided): create one in the ndisc table. */
213 return neigh_create(&nd_tbl, daddr, dst->dev);
/* Template dst_ops for ordinary IPv6 routes; copied per-namespace into
 * net->ipv6.ip6_dst_ops.  Wires up the callbacks declared above.
 */
216 static struct dst_ops ip6_dst_ops_template = {
220 .check = ip6_dst_check,
221 .default_advmss = ip6_default_advmss,
223 .cow_metrics = ipv6_cow_metrics,
224 .destroy = ip6_dst_destroy,
225 .ifdown = ip6_dst_ifdown,
226 .negative_advice = ip6_negative_advice,
227 .link_failure = ip6_link_failure,
228 .update_pmtu = ip6_rt_update_pmtu,
229 .redirect = rt6_do_redirect,
230 .local_out = __ip6_local_out,
231 .neigh_lookup = ip6_neigh_lookup,
/* MTU for a blackhole dst: the raw RTAX_MTU metric if set, else the
 * device MTU.
 */
234 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
236 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
238 return mtu ? : dst->dev->mtu;
/* Blackhole routes deliberately ignore PMTU updates and redirects:
 * both callbacks are empty stubs.
 */
241 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
242 struct sk_buff *skb, u32 mtu)
246 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
/* dst_ops for blackhole dsts created by ip6_blackhole_route(); shares the
 * normal check/destroy paths but swallows pmtu/redirect events.
 */
251 static struct dst_ops ip6_dst_blackhole_ops = {
253 .destroy = ip6_dst_destroy,
254 .check = ip6_dst_check,
255 .mtu = ip6_blackhole_mtu,
256 .default_advmss = ip6_default_advmss,
257 .update_pmtu = ip6_rt_blackhole_update_pmtu,
258 .redirect = ip6_rt_blackhole_redirect,
259 .cow_metrics = dst_cow_metrics_generic,
260 .neigh_lookup = ip6_neigh_lookup,
/* Default metrics for the template routes below (hop limit 0 = "unset"). */
263 static const u32 ip6_template_metrics[RTAX_MAX] = {
264 [RTAX_HOPLIMIT - 1] = 0,
/* ip6_null_entry: the -ENETUNREACH "no route" sentinel.  Per-namespace
 * copies are made at init; refcounted at 1 so it is never freed.
 */
267 static const struct rt6_info ip6_null_entry_template = {
269 .__refcnt = ATOMIC_INIT(1),
271 .obsolete = DST_OBSOLETE_FORCE_CHK,
272 .error = -ENETUNREACH,
273 .input = ip6_pkt_discard,
274 .output = ip6_pkt_discard_out,
276 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
277 .rt6i_protocol = RTPROT_KERNEL,
278 .rt6i_metric = ~(u32) 0,
279 .rt6i_ref = ATOMIC_INIT(1),
282 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* With policy routing: "prohibit" sentinel (admin prohibited, sends
 * ICMPv6 errors via ip6_pkt_prohibit*)...
 */
284 static const struct rt6_info ip6_prohibit_entry_template = {
286 .__refcnt = ATOMIC_INIT(1),
288 .obsolete = DST_OBSOLETE_FORCE_CHK,
290 .input = ip6_pkt_prohibit,
291 .output = ip6_pkt_prohibit_out,
293 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
294 .rt6i_protocol = RTPROT_KERNEL,
295 .rt6i_metric = ~(u32) 0,
296 .rt6i_ref = ATOMIC_INIT(1),
/* ...and "blackhole" sentinel (silently discards, no error generated). */
299 static const struct rt6_info ip6_blk_hole_entry_template = {
301 .__refcnt = ATOMIC_INIT(1),
303 .obsolete = DST_OBSOLETE_FORCE_CHK,
305 .input = dst_discard,
306 .output = dst_discard_out,
308 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
309 .rt6i_protocol = RTPROT_KERNEL,
310 .rt6i_metric = ~(u32) 0,
311 .rt6i_ref = ATOMIC_INIT(1),
/* Zero the rt6_info payload that follows the embedded dst_entry and set up
 * its list heads.  dst_alloc() initialised the dst part; "dst + 1" is the
 * first byte after it.
 */
316 static void rt6_info_init(struct rt6_info *rt)
318 struct dst_entry *dst = &rt->dst;
320 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
321 INIT_LIST_HEAD(&rt->rt6i_siblings);
322 INIT_LIST_HEAD(&rt->rt6i_uncached);
325 /* allocate dst with ip6_dst_ops */
326 static struct rt6_info *__ip6_dst_alloc(struct net *net,
327 struct net_device *dev,
330 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
331 0, DST_OBSOLETE_FORCE_CHK, flags);
/* Allocate a rt6_info plus its per-cpu clone array (rt6i_pcpu).  On
 * percpu-allocation failure the route is destroyed (elided error path
 * around line 358).
 */
339 static struct rt6_info *ip6_dst_alloc(struct net *net,
340 struct net_device *dev,
343 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
346 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
/* Clear each CPU's clone slot; nobody else can see rt yet. */
350 for_each_possible_cpu(cpu) {
353 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
354 /* no one shares rt */
358 dst_destroy((struct dst_entry *)rt);
/* dst_ops->destroy: final teardown of a rt6_info.  Releases generic
 * metrics, the per-cpu clone array, the uncached-list linkage, the idev
 * reference, and (in elided lines) the dst.from parent reference.
 */
366 static void ip6_dst_destroy(struct dst_entry *dst)
368 struct rt6_info *rt = (struct rt6_info *)dst;
369 struct dst_entry *from = dst->from;
370 struct inet6_dev *idev;
372 dst_destroy_metrics_generic(dst);
373 free_percpu(rt->rt6i_pcpu);
374 rt6_uncached_list_del(rt);
376 idev = rt->rt6i_idev;
378 rt->rt6i_idev = NULL;
/* dst_ops->ifdown: @dev is going down; if this route holds an idev on that
 * device, swap it for the namespace loopback idev so the device can go away.
 */
386 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
389 struct rt6_info *rt = (struct rt6_info *)dst;
390 struct inet6_dev *idev = rt->rt6i_idev;
391 struct net_device *loopback_dev =
392 dev_net(dev)->loopback_dev;
394 if (dev != loopback_dev) {
395 if (idev && idev->dev == dev) {
396 struct inet6_dev *loopback_idev =
397 in6_dev_get(loopback_dev);
399 rt->rt6i_idev = loopback_idev;
/* Expiry test for this route only (no parent chase). */
406 static bool __rt6_check_expired(const struct rt6_info *rt)
408 if (rt->rt6i_flags & RTF_EXPIRES)
409 return time_after(jiffies, rt->dst.expires)
/* Full expiry test: a clone without its own RTF_EXPIRES inherits expiry
 * from the parent route it was copied from (dst.from), recursively.
 */
414 static bool rt6_check_expired(const struct rt6_info *rt)
416 if (rt->rt6i_flags & RTF_EXPIRES) {
417 if (time_after(jiffies, rt->dst.expires))
419 } else if (rt->dst.from) {
420 return rt6_check_expired((struct rt6_info *) rt->dst.from);
425 /* Multipath route selection:
426 * Hash based function using packet header and flowlabel.
427 * Adapted from fib_info_hashfn()
/* Map the flow hash onto [0, candidate_count) to pick a nexthop index. */
429 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
430 const struct flowi6 *fl6)
432 return get_hash_from_flowi6(fl6) % candidate_count;
/* Pick one route among @match and its ECMP siblings by flow hash, skipping
 * siblings whose rt6_score_route() rejects them for this oif/strictness.
 */
435 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
436 struct flowi6 *fl6, int oif,
439 struct rt6_info *sibling, *next_sibling;
442 route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
443 /* Don't change the route, if route_choosen == 0
444 * (siblings does not include ourself)
447 list_for_each_entry_safe(sibling, next_sibling,
448 &match->rt6i_siblings, rt6i_siblings) {
/* Count down to the chosen sibling; scoring failure falls through to
 * the next candidate (continuation elided).
 */
450 if (route_choosen == 0) {
451 if (rt6_score_route(sibling, oif, strict) < 0)
461 * Route lookup. Any table->tb6_lock is implied.
/* Walk the fib6 leaf chain starting at @rt and return the entry matching
 * the requested output interface (@oif) and/or source address (@saddr).
 * Loopback entries are remembered as a fallback ("local").  With
 * RT6_LOOKUP_F_IFACE and no match, the null entry is returned instead.
 */
464 static inline struct rt6_info *rt6_device_match(struct net *net,
466 const struct in6_addr *saddr,
470 struct rt6_info *local = NULL;
471 struct rt6_info *sprt;
/* No constraints at all: first route wins (early-return path elided). */
473 if (!oif && ipv6_addr_any(saddr))
476 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
477 struct net_device *dev = sprt->dst.dev;
480 if (dev->ifindex == oif)
482 if (dev->flags & IFF_LOOPBACK) {
483 if (!sprt->rt6i_idev ||
484 sprt->rt6i_idev->dev->ifindex != oif) {
485 if (flags & RT6_LOOKUP_F_IFACE)
488 local->rt6i_idev->dev->ifindex == oif)
/* Source-address match check for this candidate. */
494 if (ipv6_chk_addr(net, saddr, dev,
495 flags & RT6_LOOKUP_F_IFACE))
/* Nothing matched: strict interface lookup fails hard to the null entry. */
504 if (flags & RT6_LOOKUP_F_IFACE)
505 return net->ipv6.ip6_null_entry;
511 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context for a router reachability probe: the target router
 * address and a held reference on the outgoing device.
 */
512 struct __rt6_probe_work {
513 struct work_struct work;
514 struct in6_addr target;
515 struct net_device *dev;
/* Workqueue handler: send a neighbour solicitation to the router's
 * solicited-node multicast address (runs in process context so ndisc can
 * allocate/transmit safely).  Device reference released in elided lines.
 */
518 static void rt6_probe_deferred(struct work_struct *w)
520 struct in6_addr mcaddr;
521 struct __rt6_probe_work *work =
522 container_of(w, struct __rt6_probe_work, work);
524 addrconf_addr_solict_mult(&work->target, &mcaddr);
525 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
/* Router Reachability Probing (RFC 4191 interpretation): if the gateway's
 * neighbour entry is not NUD_VALID and the per-idev probe interval has
 * elapsed, schedule a deferred NS probe.  Rate-limited via
 * __neigh_set_probe_once() under the neighbour lock.
 */
530 static void rt6_probe(struct rt6_info *rt)
532 struct __rt6_probe_work *work;
533 struct neighbour *neigh;
535 * Okay, this does not seem to be appropriate
536 * for now, however, we need to check if it
537 * is really so; aka Router Reachability Probing.
539 * Router Reachability Probe MUST be rate-limited
540 * to no more than one per minute.
542 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
545 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
547 if (neigh->nud_state & NUD_VALID)
551 write_lock(&neigh->lock);
552 if (!(neigh->nud_state & NUD_VALID) &&
555 rt->rt6i_idev->cnf.rtr_probe_interval)) {
556 work = kmalloc(sizeof(*work), GFP_ATOMIC);
558 __neigh_set_probe_once(neigh);
560 write_unlock(&neigh->lock);
/* No neighbour entry yet: probe unconditionally. */
562 work = kmalloc(sizeof(*work), GFP_ATOMIC);
566 INIT_WORK(&work->work, rt6_probe_deferred);
567 work->target = rt->rt6i_gateway;
568 dev_hold(rt->dst.dev);
569 work->dev = rt->dst.dev;
570 schedule_work(&work->work);
574 rcu_read_unlock_bh();
/* !CONFIG_IPV6_ROUTER_PREF: probing compiled out, empty stub. */
577 static inline void rt6_probe(struct rt6_info *rt)
583 * Default Router Selection (RFC 2461 6.3.6)
/* Interface component of the route score: non-zero when the route's device
 * (or, for loopback routes, its idev) matches the requested @oif, or when
 * no oif was requested.
 */
585 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
587 struct net_device *dev = rt->dst.dev;
588 if (!oif || dev->ifindex == oif)
590 if ((dev->flags & IFF_LOOPBACK) &&
591 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Neighbour-reachability component of the score.  Routes without a gateway
 * trivially succeed.  Otherwise inspect the gateway's NUD state:
 * VALID succeeds; with router-pref enabled an unresolved-but-not-FAILED
 * entry also succeeds, FAILED asks for a probe; no entry at all either
 * succeeds (router-pref) or requests round-robin.
 */
596 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
598 struct neighbour *neigh;
599 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
601 if (rt->rt6i_flags & RTF_NONEXTHOP ||
602 !(rt->rt6i_flags & RTF_GATEWAY))
603 return RT6_NUD_SUCCEED;
606 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
608 read_lock(&neigh->lock);
609 if (neigh->nud_state & NUD_VALID)
610 ret = RT6_NUD_SUCCEED;
611 #ifdef CONFIG_IPV6_ROUTER_PREF
612 else if (!(neigh->nud_state & NUD_FAILED))
613 ret = RT6_NUD_SUCCEED;
615 ret = RT6_NUD_FAIL_PROBE;
617 read_unlock(&neigh->lock);
619 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
620 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
622 rcu_read_unlock_bh();
/* Combined score: interface match in the low bits, decoded RFC 4191 router
 * preference shifted above them, then (when REACHABLE is requested) the
 * neighbour verdict folded in.  Negative return = unusable (see enum).
 */
627 static int rt6_score_route(struct rt6_info *rt, int oif,
632 m = rt6_check_dev(rt, oif);
633 if (!m && (strict & RT6_LOOKUP_F_IFACE))
634 return RT6_NUD_FAIL_HARD;
635 #ifdef CONFIG_IPV6_ROUTER_PREF
636 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
638 if (strict & RT6_LOOKUP_F_REACHABLE) {
639 int n = rt6_check_neigh(rt);
/* Compare @rt against the best candidate so far (*mpri/match).  Routes on
 * carrier-down devices (when ignore_routes_with_linkdown is set) and
 * expired routes are skipped.  A FAIL_DO_RR verdict flags the caller to
 * rotate the round-robin pointer; FAIL_HARD may trigger a probe when a
 * reachable route was requested.
 */
646 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
647 int *mpri, struct rt6_info *match,
651 bool match_do_rr = false;
652 struct inet6_dev *idev = rt->rt6i_idev;
653 struct net_device *dev = rt->dst.dev;
655 if (dev && !netif_carrier_ok(dev) &&
656 idev->cnf.ignore_routes_with_linkdown)
659 if (rt6_check_expired(rt))
662 m = rt6_score_route(rt, oif, strict);
663 if (m == RT6_NUD_FAIL_DO_RR) {
665 m = 0; /* lowest valid score */
666 } else if (m == RT6_NUD_FAIL_HARD) {
670 if (strict & RT6_LOOKUP_F_REACHABLE)
673 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
675 *do_rr = match_do_rr;
/* Scan the leaf chain for the best route at @metric: first the segment
 * from the round-robin head (@rr_head) to the end, then from the leaf
 * start back up to rr_head, then any deferred continuation (@cont) of
 * higher-metric entries.
 */
683 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
684 struct rt6_info *rr_head,
685 u32 metric, int oif, int strict,
688 struct rt6_info *rt, *match, *cont;
693 for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
694 if (rt->rt6i_metric != metric) {
699 match = find_match(rt, oif, strict, &mpri, match, do_rr);
702 for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
703 if (rt->rt6i_metric != metric) {
708 match = find_match(rt, oif, strict, &mpri, match, do_rr);
714 for (rt = cont; rt; rt = rt->dst.rt6_next)
715 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Default router selection for node @fn: pick the best same-metric route
 * starting at the node's round-robin pointer; on a do_rr verdict advance
 * rr_ptr to the next sibling so subsequent lookups rotate.  Falls back to
 * the null entry when nothing matched.
 */
720 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
722 struct rt6_info *match, *rt0;
728 fn->rr_ptr = rt0 = fn->leaf;
730 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
734 struct rt6_info *next = rt0->dst.rt6_next;
736 /* no entries matched; do round-robin */
737 if (!next || next->rt6i_metric != rt0->rt6i_metric)
744 net = dev_net(rt0->dst.dev);
745 return match ? match : net->ipv6.ip6_null_entry;
/* True when the route has a gateway or is marked no-nexthop; used to decide
 * whether a cache clone may carry RTF_ANYCAST (see ip6_rt_cache_alloc).
 */
748 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
750 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
753 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process an RFC 4191 Route Information Option received in a Router
 * Advertisement from @gwaddr on @dev: validate length/prefix-len, decode
 * preference and lifetime, then add, update, or expire the corresponding
 * RTF_ROUTEINFO route.
 */
754 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
755 const struct in6_addr *gwaddr)
757 struct route_info *rinfo = (struct route_info *) opt;
758 struct in6_addr prefix_buf, *prefix;
760 unsigned long lifetime;
763 if (len < sizeof(struct route_info)) {
767 /* Sanity check for prefix_len and length */
768 if (rinfo->length > 3) {
770 } else if (rinfo->prefix_len > 128) {
772 } else if (rinfo->prefix_len > 64) {
773 if (rinfo->length < 2) {
776 } else if (rinfo->prefix_len > 0) {
777 if (rinfo->length < 1) {
782 pref = rinfo->route_pref;
783 if (pref == ICMPV6_ROUTER_PREF_INVALID)
786 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length == 3 means a full 16-byte prefix is present in the option;
 * otherwise copy only prefix_len bits into a local buffer.
 */
788 if (rinfo->length == 3)
789 prefix = (struct in6_addr *)rinfo->prefix;
791 /* this function is safe */
792 ipv6_addr_prefix(&prefix_buf,
793 (struct in6_addr *)rinfo->prefix,
795 prefix = &prefix_buf;
/* prefix_len 0 = default-route information; otherwise a specific-route
 * entry keyed by (prefix, gwaddr).
 */
798 if (rinfo->prefix_len == 0)
799 rt = rt6_get_dflt_router(gwaddr, dev);
801 rt = rt6_get_route_info(dev, prefix, rinfo->prefix_len, gwaddr);
/* Zero lifetime: the router is withdrawing this route (removal elided). */
803 if (rt && !lifetime) {
809 rt = rt6_add_route_info(dev, prefix, rinfo->prefix_len, gwaddr, pref);
811 rt->rt6i_flags = RTF_ROUTEINFO |
812 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
815 if (!addrconf_finite_timeout(lifetime))
816 rt6_clean_expires(rt);
818 rt6_set_expires(rt, jiffies + HZ * lifetime);
/* After a failed match at @fn, walk back up the fib6 tree to the next node
 * worth trying: stop at the tree root, descend into a parent's source-
 * address subtree when present (CONFIG_IPV6_SUBTREES), and return the
 * first ancestor that actually carries route info (RTN_RTINFO).
 */
826 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
827 struct in6_addr *saddr)
829 struct fib6_node *pn;
831 if (fn->fn_flags & RTN_TL_ROOT)
834 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
835 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
838 if (fn->fn_flags & RTN_RTINFO)
/* Flow-lookup backend (no cloning): find the fib6 node for daddr/saddr,
 * filter by device, apply ECMP selection, and backtrack on a null-entry
 * result.  Bumps dst use/refcount under the table read lock.
 */
843 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
844 struct fib6_table *table,
845 struct flowi6 *fl6, int flags)
847 struct fib6_node *fn;
850 read_lock_bh(&table->tb6_lock);
851 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
854 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
855 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
856 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
857 if (rt == net->ipv6.ip6_null_entry) {
858 fn = fib6_backtrack(fn, &fl6->saddr);
862 dst_use(&rt->dst, jiffies);
863 read_unlock_bh(&table->tb6_lock);
/* Public wrapper: run the lookup through the policy-routing rules. */
868 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
871 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
873 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience lookup by (daddr, saddr, oif).  @strict maps to
 * RT6_LOOKUP_F_IFACE; a non-NULL saddr adds HAS_SADDR.
 */
875 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
876 const struct in6_addr *saddr, int oif, int strict)
878 struct flowi6 fl6 = {
882 struct dst_entry *dst;
883 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
886 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
887 flags |= RT6_LOOKUP_F_HAS_SADDR;
890 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
892 return (struct rt6_info *) dst;
898 EXPORT_SYMBOL(rt6_lookup);
900 /* ip6_ins_rt is called with FREE table->tb6_lock.
901 It takes new route entry, the addition fails by any reason the
902 route is freed. In any case, if caller does not hold it, it may
/* Insert @rt into its table under the write lock; @mxc carries netlink
 * metrics to apply.  Returns fib6_add()'s error code.
 */
906 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
907 struct mx6_config *mxc)
910 struct fib6_table *table;
912 table = rt->rt6i_table;
913 write_lock_bh(&table->tb6_lock);
914 err = fib6_add(&table->tb6_root, rt, info, mxc);
915 write_unlock_bh(&table->tb6_lock);
/* Insert with default (empty) netlink info and metrics. */
920 int ip6_ins_rt(struct rt6_info *rt)
922 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
923 struct mx6_config mxc = { .mx = NULL, };
925 return __ip6_ins_rt(rt, &info, &mxc);
/* Create an RTF_CACHE clone of @ort for a specific (daddr, saddr) pair,
 * e.g. to hold per-destination PMTU state.  If @ort is itself a clone,
 * climb to its parent first so the cache entry hangs off the real route.
 * The clone is a /128 host route for @daddr.
 */
928 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
929 const struct in6_addr *daddr,
930 const struct in6_addr *saddr)
938 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
939 ort = (struct rt6_info *)ort->dst.from;
941 rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
946 ip6_rt_copy_init(rt, ort);
947 rt->rt6i_flags |= RTF_CACHE;
949 rt->dst.flags |= DST_HOST;
950 rt->rt6i_dst.addr = *daddr;
951 rt->rt6i_dst.plen = 128;
/* Gateway-less routes: a clone of a non-host route whose prefix address
 * equals daddr is an anycast destination.
 */
953 if (!rt6_is_gw_or_nonexthop(ort)) {
954 if (ort->rt6i_dst.plen != 128 &&
955 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
956 rt->rt6i_flags |= RTF_ANYCAST;
957 #ifdef CONFIG_IPV6_SUBTREES
958 if (rt->rt6i_src.plen && saddr) {
959 rt->rt6i_src.addr = *saddr;
960 rt->rt6i_src.plen = 128;
/* Create an RTF_PCPU per-cpu clone of @rt (one slot per CPU in
 * rt->rt6i_pcpu); inherits protocol and flags from the parent.
 */
968 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
970 struct rt6_info *pcpu_rt;
972 pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
973 rt->dst.dev, rt->dst.flags);
977 ip6_rt_copy_init(pcpu_rt, rt);
978 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
979 pcpu_rt->rt6i_flags |= RTF_PCPU;
983 /* It should be called with read_lock_bh(&tb6_lock) acquired */
/* Fast path: return this CPU's existing clone of @rt (with a fresh dst
 * hold and refreshed inherited metrics), or NULL if none cached yet.
 */
984 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
986 struct rt6_info *pcpu_rt, **p;
988 p = this_cpu_ptr(rt->rt6i_pcpu);
992 dst_hold(&pcpu_rt->dst);
993 rt6_dst_from_metrics_check(pcpu_rt);
/* Slow path: allocate a per-cpu clone and install it into this CPU's slot
 * with cmpxchg (another CPU—or a second call on this CPU—may race us; the
 * loser destroys its copy).  If @rt was removed from the tree before we
 * reacquired the read lock, drop the clone and let the next dst_check()
 * force a fresh lookup.  Returns the null entry on allocation failure.
 */
998 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1000 struct fib6_table *table = rt->rt6i_table;
1001 struct rt6_info *pcpu_rt, *prev, **p;
1003 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1005 struct net *net = dev_net(rt->dst.dev);
1007 dst_hold(&net->ipv6.ip6_null_entry->dst);
1008 return net->ipv6.ip6_null_entry;
1011 read_lock_bh(&table->tb6_lock);
1012 if (rt->rt6i_pcpu) {
1013 p = this_cpu_ptr(rt->rt6i_pcpu);
1014 prev = cmpxchg(p, NULL, pcpu_rt);
1016 /* If someone did it before us, return prev instead */
1017 dst_destroy(&pcpu_rt->dst);
1021 /* rt has been removed from the fib6 tree
1022 * before we have a chance to acquire the read_lock.
1023 * In this case, don't brother to create a pcpu rt
1024 * since rt is going away anyway. The next
1025 * dst_check() will trigger a re-lookup.
1027 dst_destroy(&pcpu_rt->dst);
1030 dst_hold(&pcpu_rt->dst);
1031 rt6_dst_from_metrics_check(pcpu_rt);
1032 read_unlock_bh(&table->tb6_lock);
/* Core policy-routing lookup used by both input and output paths.
 * Finds the best route for @fl6 in @table, retrying via fib6_backtrack
 * and with relaxed reachability when needed, then returns one of:
 *   - the route itself (null entry or RTF_CACHE entry),
 *   - a fresh uncached RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH, no gateway),
 *   - this CPU's per-cpu clone (default case).
 * All returns carry a dst reference.
 */
1036 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1037 struct flowi6 *fl6, int flags)
1039 struct fib6_node *fn, *saved_fn;
1040 struct rt6_info *rt;
1043 strict |= flags & RT6_LOOKUP_F_IFACE;
/* With forwarding disabled we are a host: prefer reachable routers
 * (RFC 4861 default router selection).
 */
1044 if (net->ipv6.devconf_all->forwarding == 0)
1045 strict |= RT6_LOOKUP_F_REACHABLE;
1047 read_lock_bh(&table->tb6_lock);
1049 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1052 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1056 rt = rt6_select(fn, oif, strict);
1057 if (rt->rt6i_nsiblings)
1058 rt = rt6_multipath_select(rt, fl6, oif, strict);
1059 if (rt == net->ipv6.ip6_null_entry) {
1060 fn = fib6_backtrack(fn, &fl6->saddr);
1062 goto redo_rt6_select;
1063 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1064 /* also consider unreachable route */
1065 strict &= ~RT6_LOOKUP_F_REACHABLE;
1067 goto redo_rt6_select;
1072 if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1073 dst_use(&rt->dst, jiffies);
1074 read_unlock_bh(&table->tb6_lock);
1076 rt6_dst_from_metrics_check(rt);
1078 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1079 !(rt->rt6i_flags & RTF_GATEWAY))) {
1080 /* Create a RTF_CACHE clone which will not be
1081 * owned by the fib6 tree. It is for the special case where
1082 * the daddr in the skb during the neighbor look-up is different
1083 * from the fl6->daddr used to look-up route here.
1086 struct rt6_info *uncached_rt;
1088 dst_use(&rt->dst, jiffies);
1089 read_unlock_bh(&table->tb6_lock);
1091 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1092 dst_release(&rt->dst);
1095 rt6_uncached_list_add(uncached_rt);
/* Clone allocation failed: fall back to the null entry. */
1097 uncached_rt = net->ipv6.ip6_null_entry;
1099 dst_hold(&uncached_rt->dst);
1103 /* Get a percpu copy */
1105 struct rt6_info *pcpu_rt;
1107 rt->dst.lastuse = jiffies;
1109 pcpu_rt = rt6_get_pcpu_route(rt);
1112 read_unlock_bh(&table->tb6_lock);
1114 /* We have to do the read_unlock first
1115 * because rt6_make_pcpu_route() may trigger
1116 * ip6_dst_gc() which will take the write_lock.
1119 read_unlock_bh(&table->tb6_lock);
1120 pcpu_rt = rt6_make_pcpu_route(rt);
1121 dst_release(&rt->dst);
/* Input-path backend: ip6_pol_route keyed on the incoming interface. */
1129 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1130 struct flowi6 *fl6, int flags)
1132 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/* Input lookup through the policy rules; force strict interface matching
 * for link-local/multicast destinations (except on PIM register devices).
 */
1135 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1136 struct net_device *dev,
1137 struct flowi6 *fl6, int flags)
1139 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1140 flags |= RT6_LOOKUP_F_IFACE;
1142 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
/* Attach a route to a received skb: build the flow key from the IPv6
 * header (iif resolved through l3mdev for VRF), fold in any collect_md
 * tunnel id, and set skb->dst from the lookup result.
 */
1145 void ip6_route_input(struct sk_buff *skb)
1147 const struct ipv6hdr *iph = ipv6_hdr(skb);
1148 struct net *net = dev_net(skb->dev);
1149 int flags = RT6_LOOKUP_F_HAS_SADDR;
1150 struct ip_tunnel_info *tun_info;
1151 struct flowi6 fl6 = {
1152 .flowi6_iif = l3mdev_fib_oif(skb->dev),
1153 .daddr = iph->daddr,
1154 .saddr = iph->saddr,
1155 .flowlabel = ip6_flowinfo(iph),
1156 .flowi6_mark = skb->mark,
1157 .flowi6_proto = iph->nexthdr,
1160 tun_info = skb_tunnel_info(skb);
1161 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1162 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1164 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/* Output-path backend: ip6_pol_route keyed on the outgoing interface. */
1167 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1168 struct flowi6 *fl6, int flags)
1170 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/* Public output route lookup.  Gives l3mdev (VRF) a first shot at the
 * lookup, then runs the policy rules with flags derived from the socket
 * binding, destination scope, and source-address presence/preferences.
 */
1173 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1176 struct dst_entry *dst;
1180 dst = l3mdev_rt6_dst_by_oif(net, fl6);
1184 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1186 any_src = ipv6_addr_any(&fl6->saddr);
/* Strict interface match when the socket is bound to a device, the
 * destination requires it (link-local/multicast), or an oif is given
 * without a source address.
 */
1187 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1188 (fl6->flowi6_oif && any_src))
1189 flags |= RT6_LOOKUP_F_IFACE;
1192 flags |= RT6_LOOKUP_F_HAS_SADDR;
1194 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1196 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1198 EXPORT_SYMBOL(ip6_route_output);
/* Convert @dst_orig into a standalone blackhole dst (used e.g. by IPsec
 * larval states): allocate from ip6_dst_blackhole_ops, copy the metrics
 * and routing identity of the original, discard all traffic, and release
 * the original.  Returns -ENOMEM on allocation failure.
 */
1200 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1202 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1203 struct dst_entry *new = NULL;
1205 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1211 new->input = dst_discard;
1212 new->output = dst_discard_out;
1214 dst_copy_metrics(new, &ort->dst);
1215 rt->rt6i_idev = ort->rt6i_idev;
1217 in6_dev_hold(rt->rt6i_idev);
1219 rt->rt6i_gateway = ort->rt6i_gateway;
/* RTF_PCPU is cleared: this copy is not a per-cpu clone of anything. */
1220 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1221 rt->rt6i_metric = 0;
1223 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1224 #ifdef CONFIG_IPV6_SUBTREES
1225 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1231 dst_release(dst_orig);
1232 return new ? new : ERR_PTR(-ENOMEM);
1236 * Destination cache support functions
/* A clone whose parent (dst.from) re-allocated its metrics must re-point
 * its own metrics at the parent's current block.
 */
1239 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1242 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1243 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
/* Validate a tree-owned route: still in the tree (rt6i_node set), the
 * node's serial number matches the caller's @cookie, and not expired.
 */
1246 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1248 if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1251 if (rt6_check_expired(rt))
/* Validate a clone: itself unexpired, still FORCE_CHK, and its parent
 * route passes rt6_check() for @cookie.
 */
1257 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1259 if (!__rt6_check_expired(rt) &&
1260 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1261 rt6_check((struct rt6_info *)(rt->dst.from), cookie))
/* dst_ops->check callback, invoked on every cached-dst use. */
1267 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1269 struct rt6_info *rt;
1271 rt = (struct rt6_info *) dst;
1273 /* All IPV6 dsts are created with ->obsolete set to the value
1274 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1275 * into this function always.
1278 rt6_dst_from_metrics_check(rt);
/* Clones (pcpu, or uncached with a parent) validate through the parent;
 * tree-owned routes validate directly.
 */
1280 if (rt->rt6i_flags & RTF_PCPU ||
1281 (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1282 return rt6_dst_from_check(rt, cookie);
1284 return rt6_check(rt, cookie);
/* dst_ops->negative_advice: the socket reports this cached route performed
 * badly.  Expired RTF_CACHE entries are dropped (removal elided).
 */
1287 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1289 struct rt6_info *rt = (struct rt6_info *) dst;
1292 if (rt->rt6i_flags & RTF_CACHE) {
1293 if (rt6_check_expired(rt)) {
/* dst_ops->link_failure: output could not reach the next hop.  Send an
 * ICMPv6 address-unreachable error, then invalidate the failing route:
 * cache entries are expired; a default route's node serial is bumped so
 * cached sockets re-look-up.
 */
1305 static void ip6_link_failure(struct sk_buff *skb)
1307 struct rt6_info *rt;
1309 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1311 rt = (struct rt6_info *) skb_dst(skb);
1313 if (rt->rt6i_flags & RTF_CACHE) {
1316 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1317 rt->rt6i_node->fn_sernum = -1;
/* Record a learned path MTU on @rt and arm its expiry timer
 * (ip6_rt_mtu_expires sysctl).
 */
1322 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1324 struct net *net = dev_net(rt->dst.dev);
1326 rt->rt6i_flags |= RTF_MODIFIED;
1327 rt->rt6i_pmtu = mtu;
1328 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* PMTU may be stored directly only on routes that are not already cache
 * entries but are tree-owned or per-cpu clones; other routes need a fresh
 * RTF_CACHE clone (see __ip6_rt_update_pmtu).
 */
1331 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1333 return !(rt->rt6i_flags & RTF_CACHE) &&
1334 (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
/* Apply a Packet Too Big / socket-originated PMTU update to @dst.
 * Ignores local routes and MTUs not below the current one (after clamping
 * to IPV6_MIN_MTU).  Updates in place when allowed, otherwise creates an
 * RTF_CACHE clone keyed by the addresses from @iph or the socket.
 */
1337 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1338 const struct ipv6hdr *iph, u32 mtu)
1340 struct rt6_info *rt6 = (struct rt6_info *)dst;
1342 if (rt6->rt6i_flags & RTF_LOCAL)
1346 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1347 if (mtu >= dst_mtu(dst))
1350 if (!rt6_cache_allowed_for_pmtu(rt6)) {
1351 rt6_do_update_pmtu(rt6, mtu);
1353 const struct in6_addr *daddr, *saddr;
1354 struct rt6_info *nrt6;
/* Address pair: from the offending packet's header if given, else from
 * the connected socket.
 */
1357 daddr = &iph->daddr;
1358 saddr = &iph->saddr;
1360 daddr = &sk->sk_v6_daddr;
1361 saddr = &inet6_sk(sk)->saddr;
1365 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1367 rt6_do_update_pmtu(nrt6, mtu);
1369 /* ip6_ins_rt(nrt6) will bump the
1370 * rt6->rt6i_node->fn_sernum
1371 * which will fail the next rt6_check() and
1372 * invalidate the sk->sk_dst_cache.
/* dst_ops->update_pmtu callback: thin wrapper extracting the IPv6 header
 * from the skb when one is available.
 */
1379 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1380 struct sk_buff *skb, u32 mtu)
1382 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
/* Apply a PMTU update for the flow described by the ICMPv6-quoted packet
 * in @skb->data: rebuild the flow key, look up the route, and update it.
 * @mtu is network byte order (comes from the Packet Too Big message).
 */
1385 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1386 int oif, u32 mark, kuid_t uid)
1388 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1389 struct dst_entry *dst;
1392 memset(&fl6, 0, sizeof(fl6));
1393 fl6.flowi6_oif = oif;
1394 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1395 fl6.daddr = iph->daddr;
1396 fl6.saddr = iph->saddr;
1397 fl6.flowlabel = ip6_flowinfo(iph);
1398 fl6.flowi6_uid = uid;
1400 dst = ip6_route_output(net, NULL, &fl6);
1402 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1405 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket convenience wrapper: derive netns/oif/mark/uid from @sk. */
1407 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1409 ip6_update_pmtu(skb, sock_net(sk), mtu,
1410 sk->sk_bound_dev_if, sk->sk_mark, sock_i_uid(sk));
1412 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1414 /* Handle redirects */
/* Flow key extended with the redirecting router's address, passed through
 * fib6_rule_lookup() to __ip6_route_redirect() below.
 */
1415 struct ip6rd_flowi {
1417 struct in6_addr gateway;
/* Find the route a redirect applies to: the current, unexpired, gatewayed
 * route for the destination whose device and gateway match the redirect's
 * arrival interface and source router (RFC 4861: redirects are only valid
 * from the current next hop).  Backtracks like a normal lookup on miss.
 */
1420 static struct rt6_info *__ip6_route_redirect(struct net *net,
1421 struct fib6_table *table,
1425 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1426 struct rt6_info *rt;
1427 struct fib6_node *fn;
1429 /* Get the "current" route for this destination and
1430 * check if the redirect has come from approriate router.
1432 * RFC 4861 specifies that redirects should only be
1433 * accepted if they come from the nexthop to the target.
1434 * Due to the way the routes are chosen, this notion
1435 * is a bit fuzzy and one might need to check all possible
1439 read_lock_bh(&table->tb6_lock);
1440 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1442 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1443 if (rt6_check_expired(rt))
1447 if (!(rt->rt6i_flags & RTF_GATEWAY))
1449 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1451 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1457 rt = net->ipv6.ip6_null_entry;
1458 else if (rt->dst.error) {
1459 rt = net->ipv6.ip6_null_entry;
1463 if (rt == net->ipv6.ip6_null_entry) {
1464 fn = fib6_backtrack(fn, &fl6->saddr);
1472 read_unlock_bh(&table->tb6_lock);
/* Wrap @fl6 + @gateway into an ip6rd_flowi and run the redirect lookup
 * through the policy rules.
 */
1477 static struct dst_entry *ip6_route_redirect(struct net *net,
1478 const struct flowi6 *fl6,
1479 const struct in6_addr *gateway)
1481 int flags = RT6_LOOKUP_F_HAS_SADDR;
1482 struct ip6rd_flowi rdfl;
1485 rdfl.gateway = *gateway;
1487 return fib6_rule_lookup(net, &rdfl.fl6,
1488 flags, __ip6_route_redirect);
/* Process an ICMPv6 redirect for the flow described by the IPv6 header at
 * skb->data: build the flow key, look up the affected route via
 * ip6_route_redirect() and apply the redirect with rt6_do_redirect().
 */
1491 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1493 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1494 	struct dst_entry *dst;
1497 	memset(&fl6, 0, sizeof(fl6));
1498 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
1499 	fl6.flowi6_oif = oif;
1500 	fl6.flowi6_mark = mark;
1501 	fl6.daddr = iph->daddr;
1502 	fl6.saddr = iph->saddr;
1503 	fl6.flowlabel = ip6_flowinfo(iph);
	/* The redirect must come from the current nexthop: validate against
	 * the outer header's source address.
	 */
1505 	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1506 	rt6_do_redirect(dst, NULL, skb);
1509 EXPORT_SYMBOL_GPL(ip6_redirect);
/* Variant of ip6_redirect() for the case where the redirect message does
 * not carry the offending packet's header: the flow is reconstructed from
 * the ICMPv6 rd_msg destination and the outer IPv6 header.
 */
1511 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1514 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1515 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1516 	struct dst_entry *dst;
1519 	memset(&fl6, 0, sizeof(fl6));
1520 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
1521 	fl6.flowi6_oif = oif;
1522 	fl6.flowi6_mark = mark;
1523 	fl6.daddr = msg->dest;
	/* The redirected host (us) is the outer header's destination. */
1524 	fl6.saddr = iph->daddr;
1526 	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1527 	rt6_do_redirect(dst, NULL, skb);
/* Socket-scoped redirect: forwards to ip6_redirect() with the socket's
 * namespace, bound device and mark.
 */
1531 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1533 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1535 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* dst_ops advmss hook: derive the advertised MSS from the dst's MTU minus
 * the IPv6 + TCP header overhead, clamped below by the per-netns
 * ip6_rt_min_advmss sysctl and above by the maximal non-jumbo payload.
 */
1537 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1539 	struct net_device *dev = dst->dev;
1540 	unsigned int mtu = dst_mtu(dst);
1541 	struct net *net = dev_net(dev);
1543 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1545 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1546 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1549 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1550 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1551 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
1552 	 * rely only on pmtu discovery"
1554 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1559 static unsigned int ip6_mtu(const struct dst_entry *dst)
1561 const struct rt6_info *rt = (const struct rt6_info *)dst;
1562 unsigned int mtu = rt->rt6i_pmtu;
1563 struct inet6_dev *idev;
1568 mtu = dst_metric_raw(dst, RTAX_MTU);
1575 idev = __in6_dev_get(dst->dev);
1577 mtu = idev->cnf.mtu6;
1581 return min_t(unsigned int, mtu, IP6_MAX_MTU);
/* Singly-linked list of ICMPv6 dst entries pending garbage collection,
 * protected by icmp6_dst_lock (taken _bh since dsts are touched from
 * softirq context).
 */
1584 static struct dst_entry *icmp6_dst_gc_list;
1585 static DEFINE_SPINLOCK(icmp6_dst_lock);
/* Allocate a standalone host-route dst for sending an ICMPv6 packet to
 * fl6->daddr via @dev.  The dst is not inserted in the fib; instead it is
 * chained onto icmp6_dst_gc_list so icmp6_dst_gc() can reap it once its
 * refcount drops.  Returns an ERR_PTR on failure.
 */
1587 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1590 	struct dst_entry *dst;
1591 	struct rt6_info *rt;
1592 	struct inet6_dev *idev = in6_dev_get(dev);
1593 	struct net *net = dev_net(dev);
1595 	if (unlikely(!idev))
1596 		return ERR_PTR(-ENODEV);
1598 	rt = ip6_dst_alloc(net, dev, 0);
1599 	if (unlikely(!rt)) {
1601 		dst = ERR_PTR(-ENOMEM);
1605 	rt->dst.flags |= DST_HOST;
1606 	rt->dst.output  = ip6_output;
	/* Caller owns the initial reference. */
1607 	atomic_set(&rt->dst.__refcnt, 1);
1608 	rt->rt6i_gateway  = fl6->daddr;
1609 	rt->rt6i_dst.addr = fl6->daddr;
1610 	rt->rt6i_dst.plen = 128;
1611 	rt->rt6i_idev     = idev;
1612 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
	/* Queue on the gc list so the entry is eventually reclaimed. */
1614 	spin_lock_bh(&icmp6_dst_lock);
1615 	rt->dst.next = icmp6_dst_gc_list;
1616 	icmp6_dst_gc_list = &rt->dst;
1617 	spin_unlock_bh(&icmp6_dst_lock);
1619 	fib6_force_start_gc(net);
1621 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* Walk icmp6_dst_gc_list and free entries whose refcount has dropped to
 * zero.  NOTE(review): the unlink/free statements inside the loop were
 * dropped by the extraction — see the full source.
 */
1627 int icmp6_dst_gc(void)
1629 	struct dst_entry *dst, **pprev;
1632 	spin_lock_bh(&icmp6_dst_lock);
1633 	pprev = &icmp6_dst_gc_list;
1635 	while ((dst = *pprev) != NULL) {
1636 		if (!atomic_read(&dst->__refcnt)) {
1645 	spin_unlock_bh(&icmp6_dst_lock);
/* Apply @func to every rt6_info on the icmp6 gc list, unlinking entries
 * for which it returns non-zero (used e.g. on interface teardown from
 * rt6_ifdown()).
 */
1650 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1653 	struct dst_entry *dst, **pprev;
1655 	spin_lock_bh(&icmp6_dst_lock);
1656 	pprev = &icmp6_dst_gc_list;
1657 	while ((dst = *pprev) != NULL) {
1658 		struct rt6_info *rt = (struct rt6_info *) dst;
1659 		if (func(rt, arg)) {
1666 	spin_unlock_bh(&icmp6_dst_lock);
/* dst_ops gc hook: rate-limited garbage collection of the IPv6 routing
 * cache.  Skips the run entirely when the last gc was recent and the entry
 * count is within ip6_rt_max_size; otherwise runs fib6 gc with an expiry
 * that grows on each call and decays (shifted by rt_elasticity) when the
 * table shrinks below gc_thresh.  Returns non-zero while still over limit.
 */
1669 static int ip6_dst_gc(struct dst_ops *ops)
1671 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1672 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1673 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1674 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1675 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1676 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1679 	entries = dst_entries_get_fast(ops);
1680 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1681 	    entries <= rt_max_size)
1684 	net->ipv6.ip6_rt_gc_expire++;
1685 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1686 	entries = dst_entries_get_slow(ops);
1687 	if (entries < ops->gc_thresh)
1688 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1690 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1691 	return entries > rt_max_size;
/* Convert the netlink RTA_METRICS attribute list from @cfg into the
 * RTAX_MAX-sized metrics array in @mxc.  RTAX_CC_ALGO names are resolved
 * to congestion-control keys; when the chosen algorithm requires ECN, the
 * DST_FEATURE_ECN_CA bit is folded into RTAX_FEATURES.
 */
1694 static int ip6_convert_metrics(struct mx6_config *mxc,
1695 const struct fib6_config *cfg)
1697 	bool ecn_ca = false;
1705 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1709 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1710 		int type = nla_type(nla);
1715 		if (unlikely(type > RTAX_MAX))
1718 		if (type == RTAX_CC_ALGO) {
1719 			char tmp[TCP_CA_NAME_MAX];
1721 			nla_strlcpy(tmp, nla, sizeof(tmp));
1722 			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1723 			if (val == TCP_CA_UNSPEC)
1726 			val = nla_get_u32(nla);
		/* Reject unknown feature bits from userspace. */
1728 		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1732 		__set_bit(type - 1, mxc->mx_valid);
1736 		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1737 		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
/* Build (but do not insert) an rt6_info from a fib6_config.  Validates
 * prefix lengths, resolves the output device and table, sets up input/
 * output handlers (multicast/local/forward, optionally redirected through
 * an lwtunnel state), promotes loopback routes to reject routes, validates
 * and records the gateway, and fills prefsrc/flags/table.  Returns the new
 * route or an ERR_PTR.  Callers: ip6_route_add() and
 * ip6_route_multipath_add().
 */
1747 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1749 	struct net *net = cfg->fc_nlinfo.nl_net;
1750 	struct rt6_info *rt = NULL;
1751 	struct net_device *dev = NULL;
1752 	struct inet6_dev *idev = NULL;
1753 	struct fib6_table *table;
	/* Prefix lengths beyond 128 bits are invalid. */
1757 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1759 #ifndef CONFIG_IPV6_SUBTREES
	/* Source-prefix routing requires subtree support. */
1760 	if (cfg->fc_src_len)
1763 	if (cfg->fc_ifindex) {
1765 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1768 		idev = in6_dev_get(dev);
1773 	if (cfg->fc_metric == 0)
1774 		cfg->fc_metric = IP6_RT_PRIO_USER;
1777 	if (cfg->fc_nlinfo.nlh &&
1778 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1779 		table = fib6_get_table(net, cfg->fc_table);
1781 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1782 			table = fib6_new_table(net, cfg->fc_table);
1785 		table = fib6_new_table(net, cfg->fc_table);
1791 	rt = ip6_dst_alloc(net, NULL,
1792 			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1799 	if (cfg->fc_flags & RTF_EXPIRES)
1800 		rt6_set_expires(rt, jiffies +
1801 				clock_t_to_jiffies(cfg->fc_expires));
1803 		rt6_clean_expires(rt);
1805 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1806 		cfg->fc_protocol = RTPROT_BOOT;
1807 	rt->rt6i_protocol = cfg->fc_protocol;
1809 	addr_type = ipv6_addr_type(&cfg->fc_dst);
	/* Choose the input handler by destination class. */
1811 	if (addr_type & IPV6_ADDR_MULTICAST)
1812 		rt->dst.input = ip6_mc_input;
1813 	else if (cfg->fc_flags & RTF_LOCAL)
1814 		rt->dst.input = ip6_input;
1816 		rt->dst.input = ip6_forward;
1818 	rt->dst.output = ip6_output;
1820 	if (cfg->fc_encap) {
1821 		struct lwtunnel_state *lwtstate;
1823 		err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1824 					   cfg->fc_encap, AF_INET6, cfg,
1828 		rt->dst.lwtstate = lwtstate_get(lwtstate);
		/* Let the tunnel state intercept output/input if it asks to. */
1829 		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1830 			rt->dst.lwtstate->orig_output = rt->dst.output;
1831 			rt->dst.output = lwtunnel_output;
1833 		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1834 			rt->dst.lwtstate->orig_input = rt->dst.input;
1835 			rt->dst.input = lwtunnel_input;
1839 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1840 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1841 	if (rt->rt6i_dst.plen == 128)
1842 		rt->dst.flags |= DST_HOST;
1844 #ifdef CONFIG_IPV6_SUBTREES
1845 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1846 	rt->rt6i_src.plen = cfg->fc_src_len;
1849 	rt->rt6i_metric = cfg->fc_metric;
1851 	/* We cannot add true routes via loopback here,
1852 	   they would result in kernel looping; promote them to reject routes
1854 	if ((cfg->fc_flags & RTF_REJECT) ||
1855 	    (dev && (dev->flags & IFF_LOOPBACK) &&
1856 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
1857 	     !(cfg->fc_flags & RTF_LOCAL))) {
1858 		/* hold loopback dev/idev if we haven't done so. */
1859 		if (dev != net->loopback_dev) {
1864 			dev = net->loopback_dev;
1866 			idev = in6_dev_get(dev);
1872 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		/* Map the route type to an error code and drop handlers. */
1873 		switch (cfg->fc_type) {
1875 			rt->dst.error = -EINVAL;
1876 			rt->dst.output = dst_discard_out;
1877 			rt->dst.input = dst_discard;
1880 			rt->dst.error = -EACCES;
1881 			rt->dst.output = ip6_pkt_prohibit_out;
1882 			rt->dst.input = ip6_pkt_prohibit;
1885 		case RTN_UNREACHABLE:
1887 			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1888 					: (cfg->fc_type == RTN_UNREACHABLE)
1889 					? -EHOSTUNREACH : -ENETUNREACH;
1890 			rt->dst.output = ip6_pkt_discard_out;
1891 			rt->dst.input = ip6_pkt_discard;
1897 	if (cfg->fc_flags & RTF_GATEWAY) {
1898 		const struct in6_addr *gw_addr;
1901 		gw_addr = &cfg->fc_gateway;
1902 		gwa_type = ipv6_addr_type(gw_addr);
1904 		/* if gw_addr is local we will fail to detect this in case
1905 		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
1906 		 * will return already-added prefix route via interface that
1907 		 * prefix route was assigned to, which might be non-loopback.
1910 		if (ipv6_chk_addr_and_flags(net, gw_addr,
1911 					    gwa_type & IPV6_ADDR_LINKLOCAL ?
1915 		rt->rt6i_gateway = *gw_addr;
1917 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1918 			struct rt6_info *grt;
1920 			/* IPv6 strictly inhibits using not link-local
1921 			   addresses as nexthop address.
1922 			   Otherwise, router will not able to send redirects.
1923 			   It is very good, but in some (rare!) circumstances
1924 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1925 			   some exceptions. --ANK
1927 			if (!(gwa_type & IPV6_ADDR_UNICAST))
			/* The gateway itself must be reachable via some route. */
1930 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1932 			err = -EHOSTUNREACH;
1936 			if (dev != grt->dst.dev) {
1942 			idev = grt->rt6i_idev;
1944 			in6_dev_hold(grt->rt6i_idev);
1946 			if (!(grt->rt6i_flags & RTF_GATEWAY))
1954 		if (!dev || (dev->flags & IFF_LOOPBACK))
1962 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		/* Preferred source must be a local address on @dev. */
1963 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1967 		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1968 		rt->rt6i_prefsrc.plen = 128;
1970 		rt->rt6i_prefsrc.plen = 0;
1972 	rt->rt6i_flags = cfg->fc_flags;
1976 	rt->rt6i_idev = idev;
1977 	rt->rt6i_table = table;
1979 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1990 	return ERR_PTR(err);
/* Create a route from @cfg, convert its netlink metrics, and insert it
 * into the fib via __ip6_ins_rt().
 */
1993 int ip6_route_add(struct fib6_config *cfg)
1995 	struct mx6_config mxc = { .mx = NULL, };
1996 	struct rt6_info *rt;
1999 	rt = ip6_route_info_create(cfg);
2006 	err = ip6_convert_metrics(&mxc, cfg);
2010 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
/* Remove @rt from its fib6 table under the table write lock.  The netns
 * null entry and uncached (DST_NOCACHE) routes are not in a table and are
 * handled by the early branch.
 */
2022 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2025 	struct fib6_table *table;
2026 	struct net *net = dev_net(rt->dst.dev);
2028 	if (rt == net->ipv6.ip6_null_entry ||
2029 	    rt->dst.flags & DST_NOCACHE) {
2034 	table = rt->rt6i_table;
2035 	write_lock_bh(&table->tb6_lock);
2036 	err = fib6_del(rt, info);
2037 	write_unlock_bh(&table->tb6_lock);
/* Public route-deletion entry: wraps __ip6_del_rt() with a default
 * nl_info carrying only the route's network namespace.
 */
2044 int ip6_del_rt(struct rt6_info *rt)
2046 	struct nl_info info = {
2047 		.nl_net = dev_net(rt->dst.dev),
2049 	return __ip6_del_rt(rt, &info);
/* Delete the route matching @cfg: locate the fib node for dst/src prefix,
 * then scan its leaf chain for a route matching the requested ifindex,
 * gateway and metric.  Cached clones are skipped unless RTF_CACHE was
 * explicitly requested.  The table read lock is dropped before calling
 * __ip6_del_rt() (which takes the write lock).
 */
2052 static int ip6_route_del(struct fib6_config *cfg)
2054 	struct fib6_table *table;
2055 	struct fib6_node *fn;
2056 	struct rt6_info *rt;
2059 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2063 	read_lock_bh(&table->tb6_lock);
2065 	fn = fib6_locate(&table->tb6_root,
2066 			 &cfg->fc_dst, cfg->fc_dst_len,
2067 			 &cfg->fc_src, cfg->fc_src_len);
2070 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2071 		if ((rt->rt6i_flags & RTF_CACHE) &&
2072 		    !(cfg->fc_flags & RTF_CACHE))
2074 		if (cfg->fc_ifindex &&
2076 		     rt->dst.dev->ifindex != cfg->fc_ifindex))
2078 		if (cfg->fc_flags & RTF_GATEWAY &&
2079 		    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2081 		if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2084 		read_unlock_bh(&table->tb6_lock);
2086 		return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2089 	read_unlock_bh(&table->tb6_lock);
/* Core ICMPv6 redirect processing (RFC 4861 section 8): validate the
 * redirect message (length, multicast destination, link-local target,
 * forwarding/accept_redirects sysctls, ND options), confirm the current
 * nexthop, update the neighbour cache entry for the new target, clone a
 * cache route pointing at it, insert it, and fire the NETEVENT_REDIRECT
 * notifier.
 */
2094 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2096 	struct netevent_redirect netevent;
2097 	struct rt6_info *rt, *nrt = NULL;
2098 	struct ndisc_options ndopts;
2099 	struct inet6_dev *in6_dev;
2100 	struct neighbour *neigh;
2102 	int optlen, on_link;
2105 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2106 	optlen -= sizeof(*msg);
2109 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2113 	msg = (struct rd_msg *)icmp6_hdr(skb);
2115 	if (ipv6_addr_is_multicast(&msg->dest)) {
2116 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
	/* dest == target means the destination is on-link (no gateway). */
2121 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2123 	} else if (ipv6_addr_type(&msg->target) !=
2124 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2125 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2129 	in6_dev = __in6_dev_get(skb->dev);
	/* Routers must not accept redirects; honour accept_redirects. */
2132 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2136 	 *	The IP source address of the Redirect MUST be the same as the current
2137 	 *	first-hop router for the specified ICMP Destination Address.
2140 	if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2141 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2146 	if (ndopts.nd_opts_tgt_lladdr) {
2147 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2150 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2155 	rt = (struct rt6_info *) dst;
2156 	if (rt->rt6i_flags & RTF_REJECT) {
2157 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2161 	/* Redirect received -> path was valid.
2162 	 * Look, redirects are sent only in response to data packets,
2163 	 * so that this nexthop apparently is reachable. --ANK
2165 	dst_confirm(&rt->dst);
2167 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2172 	 *	We have finally decided to accept it.
2175 	neigh_update(neigh, lladdr, NUD_STALE,
2176 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
2177 		     NEIGH_UPDATE_F_OVERRIDE|
2178 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2179 				     NEIGH_UPDATE_F_ISROUTER))
	/* Clone a cache route for the redirected destination. */
2182 	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2186 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2188 		nrt->rt6i_flags &= ~RTF_GATEWAY;
2190 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2192 	if (ip6_ins_rt(nrt))
2195 	netevent.old = &rt->dst;
2196 	netevent.new = &nrt->dst;
2197 	netevent.daddr = &msg->dest;
2198 	netevent.neigh = neigh;
2199 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2201 	if (rt->rt6i_flags & RTF_CACHE) {
2202 		rt = (struct rt6_info *) dst_clone(&rt->dst);
2207 	neigh_release(neigh);
2211 * Misc support functions
2214 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2216 BUG_ON(from->dst.from);
2218 rt->rt6i_flags &= ~RTF_EXPIRES;
2219 dst_hold(&from->dst);
2220 rt->dst.from = &from->dst;
2221 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
/* Initialise @rt as a copy of @ort: handlers, addresses, flags, idev
 * (with an extra reference), lwtunnel state, and parent linkage via
 * rt6_set_from().
 */
2224 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2226 	rt->dst.input = ort->dst.input;
2227 	rt->dst.output = ort->dst.output;
2228 	rt->rt6i_dst = ort->rt6i_dst;
2229 	rt->dst.error = ort->dst.error;
2230 	rt->rt6i_idev = ort->rt6i_idev;
2232 		in6_dev_hold(rt->rt6i_idev);
2233 	rt->dst.lastuse = jiffies;
2234 	rt->rt6i_gateway = ort->rt6i_gateway;
2235 	rt->rt6i_flags = ort->rt6i_flags;
2236 	rt6_set_from(rt, ort);
2237 	rt->rt6i_metric = ort->rt6i_metric;
2238 #ifdef CONFIG_IPV6_SUBTREES
2239 	rt->rt6i_src = ort->rt6i_src;
2241 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2242 	rt->rt6i_table = ort->rt6i_table;
2243 	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2246 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an RA-learned (RTF_ROUTEINFO) gateway route for @prefix/@prefixlen
 * via @gwaddr on @dev in the per-device route-info table.  Returns the
 * matching route or NULL.
 */
2247 static struct rt6_info *rt6_get_route_info(struct net_device *dev,
2248 const struct in6_addr *prefix, int prefixlen,
2249 const struct in6_addr *gwaddr)
2251 	struct fib6_node *fn;
2252 	struct rt6_info *rt = NULL;
2253 	struct fib6_table *table;
2255 	table = fib6_get_table(dev_net(dev),
2256 			       addrconf_rt_table(dev, RT6_TABLE_INFO));
2260 	read_lock_bh(&table->tb6_lock);
2261 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2265 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2266 		if (rt->dst.dev->ifindex != dev->ifindex)
2268 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2270 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2276 	read_unlock_bh(&table->tb6_lock);
/* Install a route learned from an RA Route Information option.  The table
 * is the l3mdev table when @dev is enslaved, otherwise the per-device
 * route-info table.  A zero prefix length is treated as a default route.
 * Returns the inserted route via rt6_get_route_info().
 */
2280 static struct rt6_info *rt6_add_route_info(struct net_device *dev,
2281 const struct in6_addr *prefix, int prefixlen,
2282 const struct in6_addr *gwaddr, unsigned int pref)
2284 	struct fib6_config cfg = {
2285 		.fc_metric	= IP6_RT_PRIO_USER,
2286 		.fc_ifindex	= dev->ifindex,
2287 		.fc_dst_len	= prefixlen,
2288 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2289 				  RTF_UP | RTF_PREF(pref),
2290 		.fc_nlinfo.portid = 0,
2291 		.fc_nlinfo.nlh = NULL,
2292 		.fc_nlinfo.nl_net = dev_net(dev),
2295 	cfg.fc_table = l3mdev_fib_table_by_index(dev_net(dev), dev->ifindex) ? : addrconf_rt_table(dev, RT6_TABLE_INFO);
2296 	cfg.fc_dst = *prefix;
2297 	cfg.fc_gateway = *gwaddr;
2299 	/* We should treat it as a default route if prefix length is 0. */
2301 		cfg.fc_flags |= RTF_DEFAULT;
2303 	ip6_route_add(&cfg);
2305 	return rt6_get_route_info(dev, prefix, prefixlen, gwaddr);
/* Find the RA-learned default route via gateway @addr on @dev by scanning
 * the root leaf chain of the per-device main table for entries carrying
 * both RTF_ADDRCONF and RTF_DEFAULT.
 */
2309 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2311 	struct rt6_info *rt;
2312 	struct fib6_table *table;
2314 	table = fib6_get_table(dev_net(dev),
2315 			       addrconf_rt_table(dev, RT6_TABLE_MAIN));
2319 	read_lock_bh(&table->tb6_lock);
2320 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2321 		if (dev == rt->dst.dev &&
2322 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2323 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
2328 	read_unlock_bh(&table->tb6_lock);
/* Install a default route learned from a Router Advertisement: an
 * expiring, addrconf-flagged gateway route with the RA-given preference.
 * Returns the inserted route via rt6_get_dflt_router().
 */
2332 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2333 struct net_device *dev,
2336 	struct fib6_config cfg = {
2337 		.fc_table	= l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_DFLT),
2338 		.fc_metric	= IP6_RT_PRIO_USER,
2339 		.fc_ifindex	= dev->ifindex,
2340 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2341 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2342 		.fc_nlinfo.portid = 0,
2343 		.fc_nlinfo.nlh = NULL,
2344 		.fc_nlinfo.nl_net = dev_net(dev),
2347 	cfg.fc_gateway = *gwaddr;
2349 	ip6_route_add(&cfg);
2351 	return rt6_get_dflt_router(gwaddr, dev);
/* fib6_clean_all() callback: select RA-learned default/addrconf routes
 * for removal unless the interface is configured with accept_ra == 2
 * (accept RAs even when forwarding).
 */
2355 int rt6_addrconf_purge(struct rt6_info *rt, void *arg) {
2356 	if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2357 	    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2))
/* Remove all RA-learned default routes in @net via rt6_addrconf_purge(). */
2362 void rt6_purge_dflt_routers(struct net *net)
2364 	fib6_clean_all(net, rt6_addrconf_purge, NULL);
/* Translate a legacy ioctl in6_rtmsg into a fib6_config, picking the
 * l3mdev table for enslaved interfaces.
 */
2367 static void rtmsg_to_fib6_config(struct net *net,
2368 struct in6_rtmsg *rtmsg,
2369 struct fib6_config *cfg)
2371 	memset(cfg, 0, sizeof(*cfg));
2373 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2375 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2376 	cfg->fc_metric = rtmsg->rtmsg_metric;
2377 	cfg->fc_expires = rtmsg->rtmsg_info;
2378 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2379 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
2380 	cfg->fc_flags = rtmsg->rtmsg_flags;
2382 	cfg->fc_nlinfo.nl_net = net;
2384 	cfg->fc_dst = rtmsg->rtmsg_dst;
2385 	cfg->fc_src = rtmsg->rtmsg_src;
2386 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
/* Legacy SIOCADDRT/SIOCDELRT ioctl handler: requires CAP_NET_ADMIN in the
 * owning user namespace, copies the in6_rtmsg from userspace, converts it
 * with rtmsg_to_fib6_config() and dispatches to add/del.
 */
2389 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2391 	struct fib6_config cfg;
2392 	struct in6_rtmsg rtmsg;
2396 	case SIOCADDRT:		/* Add a route */
2397 	case SIOCDELRT:		/* Delete a route */
2398 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2400 		err = copy_from_user(&rtmsg, arg,
2401 				     sizeof(struct in6_rtmsg));
2405 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2410 			err = ip6_route_add(&cfg);
2413 			err = ip6_route_del(&cfg);
2427 * Drop the packet on the floor
/* Common drop path for unroutable packets: bump the relevant MIB counter
 * (IN vs OUT no-route; unspecified destinations count as address errors)
 * and send an ICMPv6 destination-unreachable with @code.
 */
2430 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2433 	struct dst_entry *dst = skb_dst(skb);
2434 	switch (ipstats_mib_noroutes) {
2435 	case IPSTATS_MIB_INNOROUTES:
2436 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2437 		if (type == IPV6_ADDR_ANY) {
2438 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2439 				      IPSTATS_MIB_INADDRERRORS);
2443 	case IPSTATS_MIB_OUTNOROUTES:
2444 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2445 			      ipstats_mib_noroutes);
2448 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst.input handler for blackhole/unreachable routes on the input path. */
2453 static int ip6_pkt_discard(struct sk_buff *skb)
2455 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst.output handler for blackhole/unreachable routes on the output path. */
2458 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2460 	skb->dev = skb_dst(skb)->dev;
2461 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* dst.input handler for RTN_PROHIBIT routes (administratively denied). */
2464 static int ip6_pkt_prohibit(struct sk_buff *skb)
2466 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* dst.output handler for RTN_PROHIBIT routes (administratively denied). */
2469 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2471 	skb->dev = skb_dst(skb)->dev;
2472 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2476 * Allocate a dst for local (unicast / anycast) address.
/* Allocate a host route (plen 128) for a local unicast or anycast address
 * on the loopback device, flagged RTF_LOCAL or RTF_ANYCAST accordingly.
 * The route is DST_NOCACHE (not refcounted by the fib) and returned with
 * an initial reference held for the caller.
 */
2479 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2480 const struct in6_addr *addr,
2484 	struct net *net = dev_net(idev->dev);
2485 	struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2488 		return ERR_PTR(-ENOMEM);
2492 	rt->dst.flags |= DST_HOST;
2493 	rt->dst.input = ip6_input;
2494 	rt->dst.output = ip6_output;
2495 	rt->rt6i_idev = idev;
2497 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2499 		rt->rt6i_flags |= RTF_ANYCAST;
2501 		rt->rt6i_flags |= RTF_LOCAL;
2503 	rt->rt6i_gateway  = *addr;
2504 	rt->rt6i_dst.addr = *addr;
2505 	rt->rt6i_dst.plen = 128;
2506 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2507 	rt->rt6i_table = fib6_get_table(net, tb_id);
2508 	rt->dst.flags |= DST_NOCACHE;
2510 	atomic_set(&rt->dst.__refcnt, 1);
/* Select a source address for @daddr: use the route's preferred source
 * (rt6i_prefsrc) when set, otherwise fall back to device-based selection
 * via ipv6_dev_get_saddr().
 */
2515 int ip6_route_get_saddr(struct net *net,
2516 struct rt6_info *rt,
2517 const struct in6_addr *daddr,
2519 struct in6_addr *saddr)
2521 	struct inet6_dev *idev =
2522 			rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2524 	if (rt && rt->rt6i_prefsrc.plen)
2525 		*saddr = rt->rt6i_prefsrc.addr;
2527 		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2528 					 daddr, prefs, saddr);
2532 /* remove deleted ip from prefsrc entries */
/* Argument bundle for fib6_remove_prefsrc(): device, namespace (field not
 * visible in this extract) and the address being removed.
 */
2533 struct arg_dev_net_ip {
2534 	struct net_device *dev;
2536 	struct in6_addr *addr;
/* fib6_clean_all() callback: clear the preferred-source setting on routes
 * whose prefsrc equals the address being deleted (optionally restricted
 * to one device; NULL dev matches all).
 */
2539 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2541 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2542 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2543 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2545 	if (((void *)rt->dst.dev == dev || !dev) &&
2546 	    rt != net->ipv6.ip6_null_entry &&
2547 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2548 		/* remove prefsrc entry */
2549 		rt->rt6i_prefsrc.plen = 0;
2554 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2556 struct net *net = dev_net(ifp->idev->dev);
2557 struct arg_dev_net_ip adni = {
2558 .dev = ifp->idev->dev,
2562 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2565 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2566 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
2568 /* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_all() callback: select RA default-router routes and cached
 * gateway routes through @gateway for removal when that node stops being
 * a router.
 */
2569 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2571 	struct in6_addr *gateway = (struct in6_addr *)arg;
2573 	if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2574 	     ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2575 	     ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
/* Drop routes through @gateway after it transitioned from router to host. */
2581 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2583 	fib6_clean_all(net, fib6_clean_tohost, gateway);
/* Argument bundle for fib6_ifdown(): device (and namespace, field not
 * visible in this extract).
 */
2586 struct arg_dev_net {
2587 	struct net_device *dev;
/* Clean callback: select routes on @dev (or all devices when dev is NULL)
 * for removal, sparing the netns null entry.
 */
2591 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2593 	const struct arg_dev_net *adn = arg;
2594 	const struct net_device *dev = adn->dev;
2596 	if ((rt->dst.dev == dev || !dev) &&
2597 	    rt != adn->net->ipv6.ip6_null_entry)
/* Interface-down teardown: purge @dev's routes from the fib tables, from
 * the icmp6 dst gc list, and from the uncached dst list.
 */
2603 void rt6_ifdown(struct net *net, struct net_device *dev)
2605 	struct arg_dev_net adn = {
2610 	fib6_clean_all(net, fib6_ifdown, &adn);
2611 	icmp6_clean_all(fib6_ifdown, &adn);
2613 	rt6_uncached_list_flush_dev(net, dev);
/* Argument bundle for rt6_mtu_change_route(): device plus the new MTU
 * (mtu field not visible in this extract).
 */
2616 struct rt6_mtu_change_arg {
2617 	struct net_device *dev;
/* fib6_clean_all() callback applying a device MTU change to routes on
 * that device whose RTAX_MTU metric is not locked.  Cached routes only
 * have their stored pmtu lowered; non-cached routes get the new MTU when
 * it decreases, or when it increases and the old route MTU matched the
 * device's configured mtu6 (see the RFC 1981 rationale in the comment
 * below).
 */
2621 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2623 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2624 	struct inet6_dev *idev;
2626 	/* In IPv6 pmtu discovery is not optional,
2627 	   so that RTAX_MTU lock cannot disable it.
2628 	   We still use this lock to block changes
2629 	   caused by addrconf/ndisc.
2632 	idev = __in6_dev_get(arg->dev);
2636 	/* For administrative MTU increase, there is no way to discover
2637 	   IPv6 PMTU increase, so PMTU increase should be updated here.
2638 	   Since RFC 1981 doesn't include administrative MTU increase
2639 	   update PMTU increase is a MUST. (i.e. jumbo frame)
2642 	   If new MTU is less than route PMTU, this new MTU will be the
2643 	   lowest MTU in the path, update the route PMTU to reflect PMTU
2644 	   decreases; if new MTU is greater than route PMTU, and the
2645 	   old MTU is the lowest MTU in the path, update the route PMTU
2646 	   to reflect the increase. In this case if the other nodes' MTU
2647 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
2650 	if (rt->dst.dev == arg->dev &&
2651 	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2652 		if (rt->rt6i_flags & RTF_CACHE) {
2653 			/* For RTF_CACHE with rt6i_pmtu == 0
2654 			 * (i.e. a redirected route),
2655 			 * the metrics of its rt->dst.from has already
2658 			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2659 				rt->rt6i_pmtu = arg->mtu;
2660 		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
2661 			   (dst_mtu(&rt->dst) < arg->mtu &&
2662 			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2663 			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/* Propagate a device MTU change to all affected routes in its namespace. */
2669 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2671 	struct rt6_mtu_change_arg arg = {
2676 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE on
 * AF_INET6, consumed by nlmsg_parse() in rtm_to_fib6_config().
 */
2679 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2680 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2681 	[RTA_OIF]               = { .type = NLA_U32 },
2682 	[RTA_IIF]               = { .type = NLA_U32 },
2683 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2684 	[RTA_METRICS]           = { .type = NLA_NESTED },
2685 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
2686 	[RTA_PREF]              = { .type = NLA_U8 },
2687 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
2688 	[RTA_ENCAP]		= { .type = NLA_NESTED },
2689 	[RTA_UID]		= { .type = NLA_U32 },
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a fib6_config:
 * validate attributes against rtm_ipv6_policy, map the rtmsg header
 * fields, translate reject-type routes to RTF_REJECT, and copy each
 * optional attribute (gateway, dst/src prefixes, prefsrc, oif, priority,
 * metrics, table, multipath, preference, encapsulation).
 */
2692 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2693 struct fib6_config *cfg)
2696 	struct nlattr *tb[RTA_MAX+1];
2700 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2705 	rtm = nlmsg_data(nlh);
2706 	memset(cfg, 0, sizeof(*cfg));
2708 	cfg->fc_table = rtm->rtm_table;
2709 	cfg->fc_dst_len = rtm->rtm_dst_len;
2710 	cfg->fc_src_len = rtm->rtm_src_len;
2711 	cfg->fc_flags = RTF_UP;
2712 	cfg->fc_protocol = rtm->rtm_protocol;
2713 	cfg->fc_type = rtm->rtm_type;
2715 	if (rtm->rtm_type == RTN_UNREACHABLE ||
2716 	    rtm->rtm_type == RTN_BLACKHOLE ||
2717 	    rtm->rtm_type == RTN_PROHIBIT ||
2718 	    rtm->rtm_type == RTN_THROW)
2719 		cfg->fc_flags |= RTF_REJECT;
2721 	if (rtm->rtm_type == RTN_LOCAL)
2722 		cfg->fc_flags |= RTF_LOCAL;
2724 	if (rtm->rtm_flags & RTM_F_CLONED)
2725 		cfg->fc_flags |= RTF_CACHE;
2727 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2728 	cfg->fc_nlinfo.nlh = nlh;
2729 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2731 	if (tb[RTA_GATEWAY]) {
2732 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2733 		cfg->fc_flags |= RTF_GATEWAY;
	/* Prefix attributes may be truncated to the prefix byte length. */
2737 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2739 		if (nla_len(tb[RTA_DST]) < plen)
2742 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2746 		int plen = (rtm->rtm_src_len + 7) >> 3;
2748 		if (nla_len(tb[RTA_SRC]) < plen)
2751 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2754 	if (tb[RTA_PREFSRC])
2755 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2758 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2760 	if (tb[RTA_PRIORITY])
2761 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2763 	if (tb[RTA_METRICS]) {
2764 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2765 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2769 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2771 	if (tb[RTA_MULTIPATH]) {
2772 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2773 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2777 		pref = nla_get_u8(tb[RTA_PREF]);
		/* Unknown preference values fall back to MEDIUM. */
2778 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
2779 		    pref != ICMPV6_ROUTER_PREF_HIGH)
2780 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
2781 		cfg->fc_flags |= RTF_PREF(pref);
2785 		cfg->fc_encap = tb[RTA_ENCAP];
2787 	if (tb[RTA_ENCAP_TYPE])
2788 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
/* One pending nexthop while building a multipath route: the prepared
 * rt6_info, the per-nexthop config, its converted metrics, and the list
 * linkage onto rt6_nh_list.
 */
2796 	struct rt6_info *rt6_info;
2797 	struct fib6_config r_cfg;
2798 	struct mx6_config mxc;
2799 	struct list_head next;
/* Log every nexthop of a failed multipath replace so the admin can audit
 * what may have been partially installed.
 */
2802 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
2806 	list_for_each_entry(nh, rt6_nh_list, next) {
2807 		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
2808 			&nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
2809 			nh->r_cfg.fc_ifindex);
/* Append @rt to the multipath build list unless an equivalent nexthop
 * (same device, idev and gateway) is already queued; converts the
 * nexthop's metrics into the new list entry.
 */
2813 static int ip6_route_info_append(struct list_head *rt6_nh_list,
2814 struct rt6_info *rt, struct fib6_config *r_cfg)
2817 	struct rt6_info *rtnh;
2820 	list_for_each_entry(nh, rt6_nh_list, next) {
2821 		/* check if rt6_info already exists */
2822 		rtnh = nh->rt6_info;
2824 		if (rtnh->dst.dev == rt->dst.dev &&
2825 		    rtnh->rt6i_idev == rt->rt6i_idev &&
2826 		    ipv6_addr_equal(&rtnh->rt6i_gateway,
2831 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
2835 	err = ip6_convert_metrics(&nh->mxc, r_cfg);
2840 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
2841 	list_add_tail(&nh->next, rt6_nh_list);
2846 static int ip6_route_multipath_add(struct fib6_config *cfg)
2848 struct fib6_config r_cfg;
2849 struct rtnexthop *rtnh;
2850 struct rt6_info *rt;
2851 struct rt6_nh *err_nh;
2852 struct rt6_nh *nh, *nh_safe;
2857 int replace = (cfg->fc_nlinfo.nlh &&
2858 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
2859 LIST_HEAD(rt6_nh_list);
2861 remaining = cfg->fc_mp_len;
2862 rtnh = (struct rtnexthop *)cfg->fc_mp;
2864 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
2865 * rt6_info structs per nexthop
2867 while (rtnh_ok(rtnh, remaining)) {
2868 memcpy(&r_cfg, cfg, sizeof(*cfg));
2869 if (rtnh->rtnh_ifindex)
2870 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2872 attrlen = rtnh_attrlen(rtnh);
2874 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2876 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2878 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2879 r_cfg.fc_flags |= RTF_GATEWAY;
2881 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
2882 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
2884 r_cfg.fc_encap_type = nla_get_u16(nla);
2887 rt = ip6_route_info_create(&r_cfg);
2894 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
2900 rtnh = rtnh_next(rtnh, &remaining);
2904 list_for_each_entry(nh, &rt6_nh_list, next) {
2905 err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
2906 /* nh->rt6_info is used or freed at this point, reset to NULL*/
2907 nh->rt6_info = NULL;
2910 ip6_print_replace_route_err(&rt6_nh_list);
2915 /* Because each route is added like a single route we remove
2916 * these flags after the first nexthop: if there is a collision,
2917 * we have already failed to add the first nexthop:
2918 * fib6_add_rt2node() has rejected it; when replacing, old
2919 * nexthops have been replaced by first new, the rest should
2922 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2930 /* Delete routes that were already added */
2931 list_for_each_entry(nh, &rt6_nh_list, next) {
2934 ip6_route_del(&nh->r_cfg);
2938 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
2940 dst_free(&nh->rt6_info->dst);
2942 list_del(&nh->next);
/* Delete a multipath route: walk every rtnexthop entry, build a per-
 * nexthop config (device index and gateway override the base config) and
 * delete each via ip6_route_del(), remembering the last failure.
 */
2949 static int ip6_route_multipath_del(struct fib6_config *cfg)
2951 	struct fib6_config r_cfg;
2952 	struct rtnexthop *rtnh;
2955 	int err = 1, last_err = 0;
2957 	remaining = cfg->fc_mp_len;
2958 	rtnh = (struct rtnexthop *)cfg->fc_mp;
2960 	/* Parse a Multipath Entry */
2961 	while (rtnh_ok(rtnh, remaining)) {
2962 		memcpy(&r_cfg, cfg, sizeof(*cfg));
2963 		if (rtnh->rtnh_ifindex)
2964 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2966 		attrlen = rtnh_attrlen(rtnh);
2968 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2970 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2972 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
2973 				r_cfg.fc_flags |= RTF_GATEWAY;
2976 		err = ip6_route_del(&r_cfg);
2980 		rtnh = rtnh_next(rtnh, &remaining);
/* RTM_DELROUTE handler: convert the netlink message into a fib6_config
 * and dispatch to the multipath or single-route delete path.
 * NOTE(review): the error check after rtm_to_fib6_config() and the
 * fc_mp test selecting between the two returns are elided here.
 */
2986 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2988 struct fib6_config cfg;
2991 err = rtm_to_fib6_config(skb, nlh, &cfg);
2996 return ip6_route_multipath_del(&cfg);
2998 return ip6_route_del(&cfg);
/* RTM_NEWROUTE handler: convert the netlink message into a fib6_config
 * and dispatch to the multipath or single-route add path.
 * NOTE(review): the error check after rtm_to_fib6_config() and the
 * fc_mp test selecting between the two returns are elided here.
 */
3001 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3003 struct fib6_config cfg;
3006 err = rtm_to_fib6_config(skb, nlh, &cfg);
3011 return ip6_route_multipath_add(&cfg);
3013 return ip6_route_add(&cfg);
/* Worst-case netlink message size for one route notification/dump entry.
 * Over-estimating is safe (nlmsg_new() just allocates a bit more);
 * under-estimating would make rt6_fill_node() fail with -EMSGSIZE
 * (see the WARN_ON in inet6_rt_notify()).
 */
3016 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
3018 return NLMSG_ALIGN(sizeof(struct rtmsg))
3019 + nla_total_size(16) /* RTA_SRC */
3020 + nla_total_size(16) /* RTA_DST */
3021 + nla_total_size(16) /* RTA_GATEWAY */
3022 + nla_total_size(16) /* RTA_PREFSRC */
3023 + nla_total_size(4) /* RTA_TABLE */
3024 + nla_total_size(4) /* RTA_IIF */
3025 + nla_total_size(4) /* RTA_OIF */
3026 + nla_total_size(4) /* RTA_PRIORITY */
3027 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3028 + nla_total_size(sizeof(struct rta_cacheinfo))
3029 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3030 + nla_total_size(1) /* RTA_PREF */
3031 + lwtunnel_get_encap_size(rt->dst.lwtstate);
/* Serialize one rt6_info into an RTM_* netlink message appended to skb.
 *
 * @dst/@src: when non-NULL, the concrete addresses of the lookup that
 *	produced this route (getroute path); the /128 lengths are then
 *	reported instead of the stored prefix lengths.
 * @iif:    input interface index to report (0 = none).
 * @prefix: when set, only RTF_PREFIX_RT routes are emitted; anything
 *	else is silently skipped (treated as success).
 * @nowait: forwarded to ip6mr_get_route() for multicast resolution.
 *
 * Returns 0 on success; on any attribute overflow the message is
 * cancelled via nlmsg_cancel() and (per nla_put_failure convention,
 * elided here) -EMSGSIZE is returned.
 */
3034 static int rt6_fill_node(struct net *net,
3035 struct sk_buff *skb, struct rt6_info *rt,
3036 struct in6_addr *dst, struct in6_addr *src,
3037 int iif, int type, u32 portid, u32 seq,
3038 int prefix, int nowait, unsigned int flags)
3040 u32 metrics[RTAX_MAX];
3042 struct nlmsghdr *nlh;
3046 if (prefix) { /* user wants prefix routes only */
3047 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
3048 /* success since this is not a prefix route */
3053 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3057 rtm = nlmsg_data(nlh);
3058 rtm->rtm_family = AF_INET6;
3059 rtm->rtm_dst_len = rt->rt6i_dst.plen;
3060 rtm->rtm_src_len = rt->rt6i_src.plen;
3063 table = rt->rt6i_table->tb6_id;
3065 table = RT6_TABLE_UNSPEC;
3066 rtm->rtm_table = table;
3067 if (nla_put_u32(skb, RTA_TABLE, table))
3068 goto nla_put_failure;
/* Map a reject route's dst.error onto the user-visible route type. */
3069 if (rt->rt6i_flags & RTF_REJECT) {
3070 switch (rt->dst.error) {
3072 rtm->rtm_type = RTN_BLACKHOLE;
3075 rtm->rtm_type = RTN_PROHIBIT;
3078 rtm->rtm_type = RTN_THROW;
3081 rtm->rtm_type = RTN_UNREACHABLE;
3085 else if (rt->rt6i_flags & RTF_LOCAL)
3086 rtm->rtm_type = RTN_LOCAL;
3087 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3088 rtm->rtm_type = RTN_LOCAL;
3090 rtm->rtm_type = RTN_UNICAST;
/* Reflect carrier loss; the route is reported dead when the admin has
 * asked to ignore routes over link-down interfaces.
 */
3092 if (!netif_carrier_ok(rt->dst.dev)) {
3093 rtm->rtm_flags |= RTNH_F_LINKDOWN;
3094 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3095 rtm->rtm_flags |= RTNH_F_DEAD;
3097 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3098 rtm->rtm_protocol = rt->rt6i_protocol;
3099 if (rt->rt6i_flags & RTF_DYNAMIC)
3100 rtm->rtm_protocol = RTPROT_REDIRECT;
3101 else if (rt->rt6i_flags & RTF_ADDRCONF) {
3102 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3103 rtm->rtm_protocol = RTPROT_RA;
3105 rtm->rtm_protocol = RTPROT_KERNEL;
3108 if (rt->rt6i_flags & RTF_CACHE)
3109 rtm->rtm_flags |= RTM_F_CLONED;
/* With a concrete lookup destination, report it as a host (/128). */
3112 if (nla_put_in6_addr(skb, RTA_DST, dst))
3113 goto nla_put_failure;
3114 rtm->rtm_dst_len = 128;
3115 } else if (rtm->rtm_dst_len)
3116 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3117 goto nla_put_failure;
3118 #ifdef CONFIG_IPV6_SUBTREES
3120 if (nla_put_in6_addr(skb, RTA_SRC, src))
3121 goto nla_put_failure;
3122 rtm->rtm_src_len = 128;
3123 } else if (rtm->rtm_src_len &&
3124 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3125 goto nla_put_failure;
3128 #ifdef CONFIG_IPV6_MROUTE
/* Multicast destinations are resolved through the IPv6 mroute code,
 * which fills the message itself; -EMSGSIZE still aborts the fill.
 */
3129 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3130 int err = ip6mr_get_route(net, skb, rtm, nowait);
3135 goto nla_put_failure;
3137 if (err == -EMSGSIZE)
3138 goto nla_put_failure;
3143 if (nla_put_u32(skb, RTA_IIF, iif))
3144 goto nla_put_failure;
3146 struct in6_addr saddr_buf;
3147 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3148 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3149 goto nla_put_failure;
3152 if (rt->rt6i_prefsrc.plen) {
3153 struct in6_addr saddr_buf;
3154 saddr_buf = rt->rt6i_prefsrc.addr;
3155 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3156 goto nla_put_failure;
/* Report metrics from the dst, preferring the per-route PMTU. */
3159 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3161 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3162 if (rtnetlink_put_metrics(skb, metrics) < 0)
3163 goto nla_put_failure;
3165 if (rt->rt6i_flags & RTF_GATEWAY) {
3166 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3167 goto nla_put_failure;
3171 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3172 goto nla_put_failure;
3173 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3174 goto nla_put_failure;
/* Remaining lifetime in jiffies, 0 when the route does not expire. */
3176 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3178 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3179 goto nla_put_failure;
3181 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3182 goto nla_put_failure;
3184 lwtunnel_fill_encap(skb, rt->dst.lwtstate);
3186 nlmsg_end(skb, nlh);
/* nla_put_failure: undo the partially-built message. */
3190 nlmsg_cancel(skb, nlh);
/* fib6 tree-walk callback for RTM_GETROUTE dumps: emit one route into
 * the dump skb. Honors the RTM_F_PREFIX request flag (when the request
 * header is large enough to carry a struct rtmsg) by asking
 * rt6_fill_node() to emit prefix routes only.
 */
3194 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3196 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3199 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3200 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3201 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3205 return rt6_fill_node(arg->net,
3206 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3207 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3208 prefix, 0, NLM_F_MULTI);
/* RTM_GETROUTE handler: perform a one-off route lookup for the
 * addresses/interfaces given in the request and unicast the resulting
 * route back to the caller.
 *
 * With RTA_IIF set, the lookup goes through the input path
 * (ip6_route_input_lookup) as if a packet arrived on that device;
 * otherwise through the output path (ip6_route_output), with l3mdev
 * flow flags applied when the oif is an L3 master device.
 *
 * NOTE(review): attribute-presence checks, error returns and the
 * dst release on failure are elided in this excerpt.
 */
3211 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3213 struct net *net = sock_net(in_skb->sk);
3214 struct nlattr *tb[RTA_MAX+1];
3215 struct rt6_info *rt;
3216 struct sk_buff *skb;
3219 int err, iif = 0, oif = 0;
3221 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3226 memset(&fl6, 0, sizeof(fl6));
/* Reject truncated address attributes before copying them. */
3229 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3232 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3236 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3239 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3243 iif = nla_get_u32(tb[RTA_IIF]);
3246 oif = nla_get_u32(tb[RTA_OIF]);
3249 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
/* UID-based routing: use the requested UID if supplied, otherwise the
 * caller's own UID for output lookups (input lookups get INVALID_UID).
 */
3252 fl6.flowi6_uid = make_kuid(current_user_ns(),
3253 nla_get_u32(tb[RTA_UID]));
3255 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
3257 struct net_device *dev;
3260 dev = __dev_get_by_index(net, iif);
3266 fl6.flowi6_iif = iif;
3268 if (!ipv6_addr_any(&fl6.saddr))
3269 flags |= RT6_LOOKUP_F_HAS_SADDR;
3271 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3274 fl6.flowi6_oif = oif;
3276 if (netif_index_is_l3_master(net, oif)) {
3277 fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC |
3278 FLOWI_FLAG_SKIP_NH_OIF;
3281 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3284 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3291 /* Reserve room for dummy headers, this skb can pass
3292 through good chunk of routing engine.
3294 skb_reset_mac_header(skb);
3295 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
/* The answer skb owns the dst reference from the lookup above. */
3297 skb_dst_set(skb, &rt->dst);
3299 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3300 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3301 nlh->nlmsg_seq, 0, 0, 0);
3307 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/* Broadcast a route change (RTM_NEWROUTE/RTM_DELROUTE) to the
 * RTNLGRP_IPV6_ROUTE multicast group. The skb is sized by
 * rt6_nlmsg_size(), so a -EMSGSIZE from rt6_fill_node() indicates a
 * bug in that size estimate (hence the WARN_ON). On failure the
 * error is reported to group listeners via rtnl_set_sk_err().
 */
3312 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3313 unsigned int nlm_flags)
3315 struct sk_buff *skb;
3316 struct net *net = info->nl_net;
3321 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
/* gfp_any(): GFP_ATOMIC in atomic context, GFP_KERNEL otherwise. */
3323 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3327 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3328 event, info->portid, seq, 0, 0, nlm_flags);
3330 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3331 WARN_ON(err == -EMSGSIZE);
3335 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3336 info->nlh, gfp_any());
3340 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/* Netdevice notifier: when a namespace's loopback device registers,
 * wire it (and its inet6_dev reference) into the special null /
 * prohibit / blackhole template routes of that namespace, which were
 * allocated in ip6_route_net_init() before any device existed.
 */
3343 static int ip6_route_dev_notify(struct notifier_block *this,
3344 unsigned long event, void *ptr)
3346 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3347 struct net *net = dev_net(dev);
3349 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3350 net->ipv6.ip6_null_entry->dst.dev = dev;
3351 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3352 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3353 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3354 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3355 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3356 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3367 #ifdef CONFIG_PROC_FS
/* file_operations for /proc/net/ipv6_route (seq_file based).
 * NOTE(review): the .read handler line is elided in this excerpt.
 */
3369 static const struct file_operations ipv6_route_proc_fops = {
3370 .owner = THIS_MODULE,
3371 .open = ipv6_route_open,
3373 .llseek = seq_lseek,
3374 .release = seq_release_net,
/* Emit the per-namespace fib6 statistics line for /proc/net/rt6_stats.
 * Seven hex fields: fib nodes, route nodes, rt allocs, rt entries,
 * cached routes, current dst entry count, discarded routes.
 */
3377 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3379 struct net *net = (struct net *)seq->private;
3380 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3381 net->ipv6.rt6_stats->fib_nodes,
3382 net->ipv6.rt6_stats->fib_route_nodes,
3383 net->ipv6.rt6_stats->fib_rt_alloc,
3384 net->ipv6.rt6_stats->fib_rt_entries,
3385 net->ipv6.rt6_stats->fib_rt_cache,
3386 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3387 net->ipv6.rt6_stats->fib_discarded_routes);
/* open() for /proc/net/rt6_stats: single-record, per-net seq file. */
3392 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3394 return single_open_net(inode, file, rt6_stats_seq_show);
/* file_operations for /proc/net/rt6_stats.
 * NOTE(review): the .read handler line is elided in this excerpt.
 */
3397 static const struct file_operations rt6_stats_seq_fops = {
3398 .owner = THIS_MODULE,
3399 .open = rt6_stats_seq_open,
3401 .llseek = seq_lseek,
3402 .release = single_release_net,
3404 #endif /* CONFIG_PROC_FS */
3406 #ifdef CONFIG_SYSCTL
/* sysctl handler for net.ipv6.route.flush: any write triggers a fib6
 * garbage-collection run. The delay value is sampled BEFORE
 * proc_dointvec() overwrites it with the newly written value; a
 * non-positive delay forces an immediate full flush.
 */
3409 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3410 void __user *buffer, size_t *lenp, loff_t *ppos)
/* ctl->extra1 carries the owning struct net (set in sysctl_init). */
3417 net = (struct net *)ctl->extra1;
3418 delay = net->ipv6.sysctl.flush_delay;
3419 proc_dointvec(ctl, write, buffer, lenp, ppos);
3420 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/* Template for the per-namespace net.ipv6.route.* sysctl table.
 * The .data pointers here reference init_net / template defaults;
 * ipv6_route_sysctl_init() kmemdup()s this table and rewires each
 * entry's .data to the new namespace BY INDEX — keep the entry order
 * here and the table[N].data assignments there in sync.
 */
3424 struct ctl_table ipv6_route_table_template[] = {
3426 .procname = "flush",
3427 .data = &init_net.ipv6.sysctl.flush_delay,
3428 .maxlen = sizeof(int),
3430 .proc_handler = ipv6_sysctl_rtcache_flush
3433 .procname = "gc_thresh",
3434 .data = &ip6_dst_ops_template.gc_thresh,
3435 .maxlen = sizeof(int),
3437 .proc_handler = proc_dointvec,
3440 .procname = "max_size",
3441 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
3442 .maxlen = sizeof(int),
3444 .proc_handler = proc_dointvec,
3447 .procname = "gc_min_interval",
3448 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3449 .maxlen = sizeof(int),
3451 .proc_handler = proc_dointvec_jiffies,
3454 .procname = "gc_timeout",
3455 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3456 .maxlen = sizeof(int),
3458 .proc_handler = proc_dointvec_jiffies,
3461 .procname = "gc_interval",
3462 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3463 .maxlen = sizeof(int),
3465 .proc_handler = proc_dointvec_jiffies,
3468 .procname = "gc_elasticity",
3469 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3470 .maxlen = sizeof(int),
3472 .proc_handler = proc_dointvec,
3475 .procname = "mtu_expires",
3476 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3477 .maxlen = sizeof(int),
3479 .proc_handler = proc_dointvec_jiffies,
3482 .procname = "min_adv_mss",
3483 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3484 .maxlen = sizeof(int),
3486 .proc_handler = proc_dointvec,
/* Same variable as gc_min_interval, exposed in milliseconds. */
3489 .procname = "gc_min_interval_ms",
3490 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3491 .maxlen = sizeof(int),
3493 .proc_handler = proc_dointvec_ms_jiffies,
/* Duplicate ipv6_route_table_template for a new namespace and point
 * each entry at that namespace's own variables. The indices below are
 * positional — they must match the template's entry order exactly.
 * Returns the table (caller registers and later frees it); NULL-check
 * of kmemdup() is elided in this excerpt.
 */
3498 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3500 struct ctl_table *table;
3502 table = kmemdup(ipv6_route_table_template,
3503 sizeof(ipv6_route_table_template),
3507 table[0].data = &net->ipv6.sysctl.flush_delay;
/* extra1 is how ipv6_sysctl_rtcache_flush() finds the namespace. */
3508 table[0].extra1 = net;
3509 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3510 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3511 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3512 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3513 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3514 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3515 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3516 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3517 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3519 /* Don't export sysctls to unprivileged users */
/* A NULL procname acts as a table terminator, so this hides the
 * entries from non-init user namespaces.
 */
3520 if (net->user_ns != &init_user_ns)
3521 table[0].procname = NULL;
/* Per-namespace setup for IPv6 routing: clone the dst_ops template,
 * allocate the special null (and, with multiple tables, prohibit and
 * blackhole) template routes, and seed the sysctl tunables with their
 * defaults. Unwinds with goto labels in reverse order on failure.
 * The devices for the template routes are attached later, by
 * ip6_route_dev_notify() when loopback registers.
 */
3528 static int __net_init ip6_route_net_init(struct net *net)
3532 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3533 sizeof(net->ipv6.ip6_dst_ops));
3535 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3536 goto out_ip6_dst_ops;
3538 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3539 sizeof(*net->ipv6.ip6_null_entry),
3541 if (!net->ipv6.ip6_null_entry)
3542 goto out_ip6_dst_entries;
/* Each template route is its own dst path and uses this net's ops. */
3543 net->ipv6.ip6_null_entry->dst.path =
3544 (struct dst_entry *)net->ipv6.ip6_null_entry;
3545 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3546 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3547 ip6_template_metrics, true);
3549 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3550 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3551 sizeof(*net->ipv6.ip6_prohibit_entry),
3553 if (!net->ipv6.ip6_prohibit_entry)
3554 goto out_ip6_null_entry;
3555 net->ipv6.ip6_prohibit_entry->dst.path =
3556 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3557 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3558 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3559 ip6_template_metrics, true);
3561 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3562 sizeof(*net->ipv6.ip6_blk_hole_entry),
3564 if (!net->ipv6.ip6_blk_hole_entry)
3565 goto out_ip6_prohibit_entry;
3566 net->ipv6.ip6_blk_hole_entry->dst.path =
3567 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3568 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3569 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3570 ip6_template_metrics, true);
/* Default values for the net.ipv6.route.* sysctls. */
3573 net->ipv6.sysctl.flush_delay = 0;
3574 net->ipv6.sysctl.ip6_rt_max_size = 4096;
3575 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3576 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3577 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3578 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3579 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3580 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3582 net->ipv6.ip6_rt_gc_expire = 30*HZ;
/* Error unwind: free in reverse order of allocation. */
3588 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3589 out_ip6_prohibit_entry:
3590 kfree(net->ipv6.ip6_prohibit_entry);
3592 kfree(net->ipv6.ip6_null_entry);
3594 out_ip6_dst_entries:
3595 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-namespace teardown: free the template routes allocated in
 * ip6_route_net_init() and release the dst entry counter.
 */
3600 static void __net_exit ip6_route_net_exit(struct net *net)
3602 kfree(net->ipv6.ip6_null_entry);
3603 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3604 kfree(net->ipv6.ip6_prohibit_entry);
3605 kfree(net->ipv6.ip6_blk_hole_entry);
3607 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-namespace init: create the /proc/net entries. Runs after
 * the main pernet init so the data they display already exists.
 */
3610 static int __net_init ip6_route_net_init_late(struct net *net)
3612 #ifdef CONFIG_PROC_FS
3613 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3614 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
/* Late per-namespace teardown: remove the /proc/net entries created
 * in ip6_route_net_init_late().
 */
3619 static void __net_exit ip6_route_net_exit_late(struct net *net)
3621 #ifdef CONFIG_PROC_FS
3622 remove_proc_entry("ipv6_route", net->proc_net);
3623 remove_proc_entry("rt6_stats", net->proc_net);
/* Main pernet operations for IPv6 routing state. */
3627 static struct pernet_operations ip6_route_net_ops = {
3628 .init = ip6_route_net_init,
3629 .exit = ip6_route_net_exit,
/* Allocate and initialize the per-namespace IPv6 inetpeer base.
 * NOTE(review): the NULL-check of the allocation is elided here.
 */
3632 static int __net_init ipv6_inetpeer_init(struct net *net)
3634 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3638 inet_peer_base_init(bp);
3639 net->ipv6.peers = bp;
/* Tear down the per-namespace inetpeer base: detach it first, then
 * invalidate the peer tree (freeing of bp itself is elided here).
 */
3643 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3645 struct inet_peer_base *bp = net->ipv6.peers;
3647 net->ipv6.peers = NULL;
3648 inetpeer_invalidate_tree(bp);
/* Pernet operations for the IPv6 inetpeer base. */
3652 static struct pernet_operations ipv6_inetpeer_ops = {
3653 .init = ipv6_inetpeer_init,
3654 .exit = ipv6_inetpeer_exit,
/* Late pernet operations (procfs entries) for IPv6 routing. */
3657 static struct pernet_operations ip6_route_net_late_ops = {
3658 .init = ip6_route_net_init_late,
3659 .exit = ip6_route_net_exit_late,
/* Netdevice notifier hooking up the special routes on loopback
 * registration; see ip6_route_dev_notify().
 */
3662 static struct notifier_block ip6_route_dev_notifier = {
3663 .notifier_call = ip6_route_dev_notify,
/* Boot-time initialization of the IPv6 routing subsystem:
 * - create the rt6_info slab cache and blackhole dst counters,
 * - register the inetpeer, main and late pernet subsystems,
 * - wire the special template routes of init_net to its loopback
 *   device (already registered by this point, so the notifier did
 *   not see it),
 * - register the rtnetlink ROUTE handlers and the device notifier,
 * - initialize the per-CPU uncached-route lists.
 * Errors unwind through the goto labels in reverse registration
 * order. NOTE(review): several intermediate steps (fib6_init and its
 * error handling, some return-value checks) are elided here.
 */
3667 int __init ip6_route_init(void)
3673 ip6_dst_ops_template.kmem_cachep =
3674 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3675 SLAB_HWCACHE_ALIGN, NULL);
3676 if (!ip6_dst_ops_template.kmem_cachep)
3679 ret = dst_entries_init(&ip6_dst_blackhole_ops);
3681 goto out_kmem_cache;
3683 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3685 goto out_dst_entries;
3687 ret = register_pernet_subsys(&ip6_route_net_ops);
3689 goto out_register_inetpeer;
3691 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3693 /* Registering of the loopback is done before this portion of code,
3694 * the loopback reference in rt6_info will not be taken, do it
3695 * manually for init_net */
3696 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3697 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3698 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3699 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3700 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3701 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3702 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3706 goto out_register_subsys;
3712 ret = fib6_rules_init();
3716 ret = register_pernet_subsys(&ip6_route_net_late_ops);
3718 goto fib6_rules_init;
/* __rtnl_register() returning non-zero here only means -ENOBUFS. */
3721 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3722 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3723 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3724 goto out_register_late_subsys;
3726 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3728 goto out_register_late_subsys;
3730 for_each_possible_cpu(cpu) {
3731 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3733 INIT_LIST_HEAD(&ul->head);
3734 spin_lock_init(&ul->lock);
/* Error unwind: undo registrations in reverse order. */
3740 out_register_late_subsys:
3741 unregister_pernet_subsys(&ip6_route_net_late_ops);
3743 fib6_rules_cleanup();
3748 out_register_subsys:
3749 unregister_pernet_subsys(&ip6_route_net_ops);
3750 out_register_inetpeer:
3751 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3753 dst_entries_destroy(&ip6_dst_blackhole_ops);
3755 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3759 void ip6_route_cleanup(void)
3761 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3762 unregister_pernet_subsys(&ip6_route_net_late_ops);
3763 fib6_rules_cleanup();
3766 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3767 unregister_pernet_subsys(&ip6_route_net_ops);
3768 dst_entries_destroy(&ip6_dst_blackhole_ops);
3769 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);