2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
61 #include <net/lwtunnel.h>
63 #include <asm/uaccess.h>
66 #include <linux/sysctl.h>
/* Neighbour-reachability scores used by rt6_check_neigh()/rt6_score_route().
 * Negative values mean the route failed the check; RT6_NUD_FAIL_DO_RR asks
 * the caller to fall back to round-robin selection.
 * NOTE(review): the enum's opening/closing lines are missing from this
 * chunk of the listing.
 */
70 RT6_NUD_FAIL_HARD = -3,
71 RT6_NUD_FAIL_PROBE = -2,
72 RT6_NUD_FAIL_DO_RR = -1,
/* Forward declarations for the static helpers defined later in this file,
 * including the dst_ops callbacks wired into ip6_dst_ops_template below.
 */
76 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
79 static unsigned int ip6_mtu(const struct dst_entry *dst);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void ip6_dst_destroy(struct dst_entry *);
82 static void ip6_dst_ifdown(struct dst_entry *,
83 struct net_device *dev, int how);
84 static int ip6_dst_gc(struct dst_ops *ops);
86 static int ip6_pkt_discard(struct sk_buff *skb);
87 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
88 static int ip6_pkt_prohibit(struct sk_buff *skb);
89 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
90 static void ip6_link_failure(struct sk_buff *skb);
91 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
92 struct sk_buff *skb, u32 mtu);
93 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
95 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
96 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
/* Route Information option (RFC 4191) handling is optional. */
98 #ifdef CONFIG_IPV6_ROUTE_INFO
99 static struct rt6_info *rt6_add_route_info(struct net *net,
100 const struct in6_addr *prefix, int prefixlen,
101 const struct in6_addr *gwaddr, int ifindex,
103 static struct rt6_info *rt6_get_route_info(struct net *net,
104 const struct in6_addr *prefix, int prefixlen,
105 const struct in6_addr *gwaddr, int ifindex);
/* Per-CPU list of uncached (DST_NOCACHE) rt6_info entries so they can be
 * walked and re-homed when a device goes away (see
 * rt6_uncached_list_flush_dev()).
 * NOTE(review): the struct's spinlock member line appears to be missing
 * from this chunk — the lock is used via ul->lock below.
 */
108 struct uncached_list {
110 struct list_head head;
113 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Add @rt to this CPU's uncached list and mark it DST_NOCACHE.
 * The per-CPU list spinlock serialises against the flush path.
 */
115 static void rt6_uncached_list_add(struct rt6_info *rt)
117 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
119 rt->dst.flags |= DST_NOCACHE;
/* Remember which CPU's list we are on so del can find the right lock. */
120 rt->rt6i_uncached_list = ul;
122 spin_lock_bh(&ul->lock);
123 list_add_tail(&rt->rt6i_uncached, &ul->head);
124 spin_unlock_bh(&ul->lock);
/* Remove @rt from its uncached list, if it was ever added.  Safe to call
 * on routes that never went through rt6_uncached_list_add() because the
 * list head is empty in that case.
 */
127 static void rt6_uncached_list_del(struct rt6_info *rt)
129 if (!list_empty(&rt->rt6i_uncached)) {
/* Use the list recorded at add time, which may belong to another CPU. */
130 struct uncached_list *ul = rt->rt6i_uncached_list;
132 spin_lock_bh(&ul->lock);
133 list_del(&rt->rt6i_uncached);
134 spin_unlock_bh(&ul->lock);
/* Walk every CPU's uncached list and re-home routes that reference @dev
 * (or all devices when @dev is NULL) onto the namespace loopback device,
 * so the device can be unregistered without dangling references.
 */
138 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
140 struct net_device *loopback_dev = net->loopback_dev;
143 for_each_possible_cpu(cpu) {
144 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
147 spin_lock_bh(&ul->lock);
148 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
149 struct inet6_dev *rt_idev = rt->rt6i_idev;
150 struct net_device *rt_dev = rt->dst.dev;
/* Swap the inet6_dev reference over to loopback. */
152 if (rt_idev && (rt_idev->dev == dev || !dev) &&
153 rt_idev->dev != loopback_dev) {
154 rt->rt6i_idev = in6_dev_get(loopback_dev);
155 in6_dev_put(rt_idev);
/* Swap the raw net_device reference as well; hold the new one.
 * NOTE(review): the matching dev_put() of the old device is not
 * visible in this chunk of the listing.
 */
158 if (rt_dev && (rt_dev == dev || !dev) &&
159 rt_dev != loopback_dev) {
160 rt->dst.dev = loopback_dev;
161 dev_hold(rt->dst.dev);
165 spin_unlock_bh(&ul->lock);
/* Per-cpu clones share metrics with the route they were cloned from
 * (dst.from), so copy-on-write goes through the parent's metrics.
 */
169 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
171 return dst_metrics_write_ptr(rt->dst.from);
/* dst_ops->cow_metrics callback: pick the metrics source depending on
 * whether this is a per-cpu clone, a cache clone, or a plain route.
 * NOTE(review): the RTF_CACHE branch's return statement is missing from
 * this chunk of the listing.
 */
174 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
176 struct rt6_info *rt = (struct rt6_info *)dst;
178 if (rt->rt6i_flags & RTF_PCPU)
179 return rt6_pcpu_cow_metrics(rt);
180 else if (rt->rt6i_flags & RTF_CACHE)
183 return dst_cow_metrics_generic(dst, old);
/* Pick the address to resolve for a route: the gateway when one is set,
 * otherwise the packet's destination address.
 * NOTE(review): lines handling the no-skb case appear to be missing from
 * this chunk of the listing.
 */
186 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
190 struct in6_addr *p = &rt->rt6i_gateway;
192 if (!ipv6_addr_any(p))
193 return (const void *) p;
195 return &ipv6_hdr(skb)->daddr;
/* dst_ops->neigh_lookup callback: find (or create) the ND neighbour entry
 * for this route's next hop on dst->dev.
 */
199 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
203 struct rt6_info *rt = (struct rt6_info *) dst;
206 daddr = choose_neigh_daddr(rt, skb, daddr);
207 n = __ipv6_neigh_lookup(dst->dev, daddr);
210 return neigh_create(&nd_tbl, daddr, dst->dev);
/* dst_ops for normal IPv6 routes; copied per-namespace into
 * net->ipv6.ip6_dst_ops.  All callbacks are the static helpers declared at
 * the top of this file.
 */
213 static struct dst_ops ip6_dst_ops_template = {
217 .check = ip6_dst_check,
218 .default_advmss = ip6_default_advmss,
220 .cow_metrics = ipv6_cow_metrics,
221 .destroy = ip6_dst_destroy,
222 .ifdown = ip6_dst_ifdown,
223 .negative_advice = ip6_negative_advice,
224 .link_failure = ip6_link_failure,
225 .update_pmtu = ip6_rt_update_pmtu,
226 .redirect = rt6_do_redirect,
227 .local_out = __ip6_local_out,
228 .neigh_lookup = ip6_neigh_lookup,
/* MTU for blackhole dsts: the raw metric if set, else the device MTU. */
231 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
233 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
235 return mtu ? : dst->dev->mtu;
/* Blackhole routes deliberately ignore PMTU updates and redirects, and
 * never hand out writable metrics (bodies are intentionally empty /
 * trivial in the original source).
 */
238 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
239 struct sk_buff *skb, u32 mtu)
243 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
248 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
/* dst_ops for blackhole dsts created by ip6_blackhole_route(): shares the
 * generic check/destroy paths but stubs out PMTU, redirect and metrics.
 */
254 static struct dst_ops ip6_dst_blackhole_ops = {
256 .destroy = ip6_dst_destroy,
257 .check = ip6_dst_check,
258 .mtu = ip6_blackhole_mtu,
259 .default_advmss = ip6_default_advmss,
260 .update_pmtu = ip6_rt_blackhole_update_pmtu,
261 .redirect = ip6_rt_blackhole_redirect,
262 .cow_metrics = ip6_rt_blackhole_cow_metrics,
263 .neigh_lookup = ip6_neigh_lookup,
/* Default metrics for the template routes below (hop limit 0 = unset). */
266 static const u32 ip6_template_metrics[RTAX_MAX] = {
267 [RTAX_HOPLIMIT - 1] = 0,
/* Template for the per-namespace "null" route: rejects all traffic with
 * -ENETUNREACH.  Copied into net->ipv6.ip6_null_entry at init.
 */
270 static const struct rt6_info ip6_null_entry_template = {
272 .__refcnt = ATOMIC_INIT(1),
274 .obsolete = DST_OBSOLETE_FORCE_CHK,
275 .error = -ENETUNREACH,
276 .input = ip6_pkt_discard,
277 .output = ip6_pkt_discard_out,
279 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
280 .rt6i_protocol = RTPROT_KERNEL,
/* Worst possible metric so real routes always win. */
281 .rt6i_metric = ~(u32) 0,
282 .rt6i_ref = ATOMIC_INIT(1),
/* With policy routing enabled, two more template routes exist:
 * "prohibit" (administratively denied, sends ICMP) and "blackhole"
 * (silently discards).  Same shape as ip6_null_entry_template.
 */
285 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
287 static const struct rt6_info ip6_prohibit_entry_template = {
289 .__refcnt = ATOMIC_INIT(1),
291 .obsolete = DST_OBSOLETE_FORCE_CHK,
293 .input = ip6_pkt_prohibit,
294 .output = ip6_pkt_prohibit_out,
296 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
297 .rt6i_protocol = RTPROT_KERNEL,
298 .rt6i_metric = ~(u32) 0,
299 .rt6i_ref = ATOMIC_INIT(1),
/* Blackhole: drop without any ICMP feedback. */
302 static const struct rt6_info ip6_blk_hole_entry_template = {
304 .__refcnt = ATOMIC_INIT(1),
306 .obsolete = DST_OBSOLETE_FORCE_CHK,
308 .input = dst_discard,
309 .output = dst_discard_sk,
311 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
312 .rt6i_protocol = RTPROT_KERNEL,
313 .rt6i_metric = ~(u32) 0,
314 .rt6i_ref = ATOMIC_INIT(1),
319 /* allocate dst with ip6_dst_ops */
/* Low-level rt6_info allocator: dst_alloc() from the namespace's IPv6 dst
 * ops, then zero everything past the embedded dst_entry and init the list
 * heads.  Returns NULL on allocation failure (the NULL check line is not
 * visible in this chunk).
 */
320 static struct rt6_info *__ip6_dst_alloc(struct net *net,
321 struct net_device *dev,
323 struct fib6_table *table)
325 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
326 0, DST_OBSOLETE_FORCE_CHK, flags);
329 struct dst_entry *dst = &rt->dst;
/* Zero only the rt6_info-specific tail; dst_alloc set up the head. */
331 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
332 INIT_LIST_HEAD(&rt->rt6i_siblings);
333 INIT_LIST_HEAD(&rt->rt6i_uncached);
/* Public allocator: __ip6_dst_alloc() plus a per-CPU pointer array used
 * for RTF_PCPU clones.  On percpu allocation failure the route is
 * destroyed (error-path lines are partly missing from this chunk).
 */
338 static struct rt6_info *ip6_dst_alloc(struct net *net,
339 struct net_device *dev,
341 struct fib6_table *table)
343 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);
346 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
350 for_each_possible_cpu(cpu) {
353 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
354 /* no one shares rt */
/* Percpu allocation failed: tear the fresh route back down. */
358 dst_destroy((struct dst_entry *)rt);
/* dst_ops->destroy callback: release metrics, percpu clones, uncached-list
 * membership, and the inet6_dev reference.
 * NOTE(review): the in6_dev_put() and dst.from release lines are missing
 * from this chunk of the listing.
 */
366 static void ip6_dst_destroy(struct dst_entry *dst)
368 struct rt6_info *rt = (struct rt6_info *)dst;
369 struct dst_entry *from = dst->from;
370 struct inet6_dev *idev;
372 dst_destroy_metrics_generic(dst);
373 free_percpu(rt->rt6i_pcpu);
374 rt6_uncached_list_del(rt);
376 idev = rt->rt6i_idev;
378 rt->rt6i_idev = NULL;
/* dst_ops->ifdown callback: when @dev goes down, move this route's
 * inet6_dev reference over to the namespace loopback device so @dev can
 * be released.  Reference-drop lines for the old idev are not visible in
 * this chunk.
 */
386 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
389 struct rt6_info *rt = (struct rt6_info *)dst;
390 struct inet6_dev *idev = rt->rt6i_idev;
391 struct net_device *loopback_dev =
392 dev_net(dev)->loopback_dev;
394 if (dev != loopback_dev) {
395 if (idev && idev->dev == dev) {
396 struct inet6_dev *loopback_idev =
397 in6_dev_get(loopback_dev);
399 rt->rt6i_idev = loopback_idev;
/* True when the route (or, for clones, the route it was derived from via
 * dst.from) has an RTF_EXPIRES deadline in the past.
 */
406 static bool rt6_check_expired(const struct rt6_info *rt)
408 if (rt->rt6i_flags & RTF_EXPIRES) {
409 if (time_after(jiffies, rt->dst.expires))
/* Clones inherit expiry from their parent route. */
411 } else if (rt->dst.from) {
412 return rt6_check_expired((struct rt6_info *) rt->dst.from);
417 /* Multipath route selection:
418 * Hash based function using packet header and flowlabel.
419 * Adapted from fib_info_hashfn()
/* Returns an index in [0, candidate_count) used to pick one sibling of an
 * ECMP route set, keeping a given flow on a stable path.
 */
421 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
422 const struct flowi6 *fl6)
424 unsigned int val = fl6->flowi6_proto;
426 val ^= ipv6_addr_hash(&fl6->daddr);
427 val ^= ipv6_addr_hash(&fl6->saddr);
429 /* Work only if this not encapsulated */
430 switch (fl6->flowi6_proto) {
/* TCP/UDP-style protocols: mix in the port pair. */
434 val ^= (__force u16)fl6->fl6_sport;
435 val ^= (__force u16)fl6->fl6_dport;
/* ICMPv6: mix in type/code instead of ports. */
439 val ^= (__force u16)fl6->fl6_icmp_type;
440 val ^= (__force u16)fl6->fl6_icmp_code;
443 /* RFC6438 recommands to use flowlabel */
444 val ^= (__force u32)fl6->flowlabel;
446 /* Perhaps, we need to tune, this function? */
447 val = val ^ (val >> 7) ^ (val >> 12);
448 return val % candidate_count;
/* Choose one route among @match and its ECMP siblings, based on the flow
 * hash; siblings with a negative score are skipped.
 * NOTE(review): the loop body that decrements route_choosen and returns
 * the chosen sibling is partly missing from this chunk.
 */
451 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
452 struct flowi6 *fl6, int oif,
455 struct rt6_info *sibling, *next_sibling;
458 route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
459 /* Don't change the route, if route_choosen == 0
460 * (siblings does not include ourself)
463 list_for_each_entry_safe(sibling, next_sibling,
464 &match->rt6i_siblings, rt6i_siblings) {
466 if (route_choosen == 0) {
467 if (rt6_score_route(sibling, oif, strict) < 0)
477 * Route lookup. Any table->tb6_lock is implied.
/* Walk the leaf chain starting at @rt and pick the first route whose
 * device matches @oif (or whose device owns @saddr when no oif is given).
 * Falls back to a loopback/local candidate, or the null entry when a
 * strict interface match (RT6_LOOKUP_F_IFACE) fails.
 */
480 static inline struct rt6_info *rt6_device_match(struct net *net,
482 const struct in6_addr *saddr,
486 struct rt6_info *local = NULL;
487 struct rt6_info *sprt;
/* Nothing to match on: the head route is good enough. */
489 if (!oif && ipv6_addr_any(saddr))
492 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
493 struct net_device *dev = sprt->dst.dev;
496 if (dev->ifindex == oif)
498 if (dev->flags & IFF_LOOPBACK) {
499 if (!sprt->rt6i_idev ||
500 sprt->rt6i_idev->dev->ifindex != oif) {
501 if (flags & RT6_LOOKUP_F_IFACE && oif)
/* Prefer an existing local candidate bound to the same oif. */
503 if (local && (!oif ||
504 local->rt6i_idev->dev->ifindex == oif))
/* No oif: match by source address ownership instead. */
510 if (ipv6_chk_addr(net, saddr, dev,
511 flags & RT6_LOOKUP_F_IFACE))
520 if (flags & RT6_LOOKUP_F_IFACE)
521 return net->ipv6.ip6_null_entry;
527 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context for sending a unicast NS probe to a router. */
528 struct __rt6_probe_work {
529 struct work_struct work;
530 struct in6_addr target;
531 struct net_device *dev;
/* Workqueue handler: send the Neighbour Solicitation to the target's
 * solicited-node multicast address.  NOTE(review): the dev_put()/kfree()
 * cleanup lines are missing from this chunk of the listing.
 */
534 static void rt6_probe_deferred(struct work_struct *w)
536 struct in6_addr mcaddr;
537 struct __rt6_probe_work *work =
538 container_of(w, struct __rt6_probe_work, work);
540 addrconf_addr_solict_mult(&work->target, &mcaddr);
541 ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
/* Rate-limited reachability probe of a gateway route's next hop
 * (RFC 4191 router reachability probing).  Only probes when the neighbour
 * is not currently NUD_VALID and the probe interval has elapsed.
 */
546 static void rt6_probe(struct rt6_info *rt)
548 struct __rt6_probe_work *work;
549 struct neighbour *neigh;
551 * Okay, this does not seem to be appropriate
552 * for now, however, we need to check if it
553 * is really so; aka Router Reachability Probing.
555 * Router Reachability Probe MUST be rate-limited
556 * to no more than one per minute.
/* Only gateway routes have a next hop worth probing. */
558 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
561 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
563 if (neigh->nud_state & NUD_VALID)
567 write_lock(&neigh->lock);
568 if (!(neigh->nud_state & NUD_VALID) &&
571 rt->rt6i_idev->cnf.rtr_probe_interval)) {
572 work = kmalloc(sizeof(*work), GFP_ATOMIC);
574 __neigh_set_probe_once(neigh);
576 write_unlock(&neigh->lock);
/* No neighbour entry yet: probe unconditionally. */
578 work = kmalloc(sizeof(*work), GFP_ATOMIC);
582 INIT_WORK(&work->work, rt6_probe_deferred);
583 work->target = rt->rt6i_gateway;
/* Hold the device until the deferred work has run. */
584 dev_hold(rt->dst.dev);
585 work->dev = rt->dst.dev;
586 schedule_work(&work->work);
590 rcu_read_unlock_bh();
/* !CONFIG_IPV6_ROUTER_PREF: probing compiles away to a no-op. */
593 static inline void rt6_probe(struct rt6_info *rt)
599 * Default Router Selection (RFC 2461 6.3.6)
/* Score the route's device against @oif: non-zero when it matches (or no
 * oif was requested), with loopback routes matching via their idev.
 * NOTE(review): the exact return values are on lines missing from this
 * chunk.
 */
601 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
603 struct net_device *dev = rt->dst.dev;
604 if (!oif || dev->ifindex == oif)
606 if ((dev->flags & IFF_LOOPBACK) &&
607 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Classify next-hop reachability for router selection: SUCCEED for
 * non-gateway routes or NUD_VALID neighbours; with router preference
 * enabled, a non-FAILED neighbour also succeeds while a failed one asks
 * for a probe; an unknown neighbour triggers round-robin fallback.
 */
612 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
614 struct neighbour *neigh;
615 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
617 if (rt->rt6i_flags & RTF_NONEXTHOP ||
618 !(rt->rt6i_flags & RTF_GATEWAY))
619 return RT6_NUD_SUCCEED;
622 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
624 read_lock(&neigh->lock);
625 if (neigh->nud_state & NUD_VALID)
626 ret = RT6_NUD_SUCCEED;
627 #ifdef CONFIG_IPV6_ROUTER_PREF
628 else if (!(neigh->nud_state & NUD_FAILED))
629 ret = RT6_NUD_SUCCEED;
631 ret = RT6_NUD_FAIL_PROBE;
633 read_unlock(&neigh->lock);
/* No neighbour entry at all: succeed (with RTR_PREF, a probe will be
 * scheduled elsewhere) or request round-robin.
 */
635 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
636 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
638 rcu_read_unlock_bh();
/* Combine device match and (optionally) neighbour reachability into a
 * single score; negative values are the rt6_nud_state failure codes.
 * NOTE(review): the lines folding the neighbour result into @m are
 * missing from this chunk of the listing.
 */
643 static int rt6_score_route(struct rt6_info *rt, int oif,
648 m = rt6_check_dev(rt, oif);
649 if (!m && (strict & RT6_LOOKUP_F_IFACE))
650 return RT6_NUD_FAIL_HARD;
651 #ifdef CONFIG_IPV6_ROUTER_PREF
/* RFC 4191 router preference occupies bits above the device-match bit. */
652 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
654 if (strict & RT6_LOOKUP_F_REACHABLE) {
655 int n = rt6_check_neigh(rt);
/* Compare @rt against the current best @match (score in *mpri); returns
 * the better of the two and records via *do_rr whether round-robin was
 * requested.  Routes on link-down devices or expired routes are skipped
 * when configured.
 */
662 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
663 int *mpri, struct rt6_info *match,
667 bool match_do_rr = false;
668 struct inet6_dev *idev = rt->rt6i_idev;
669 struct net_device *dev = rt->dst.dev;
671 if (dev && !netif_carrier_ok(dev) &&
672 idev->cnf.ignore_routes_with_linkdown)
675 if (rt6_check_expired(rt))
678 m = rt6_score_route(rt, oif, strict);
679 if (m == RT6_NUD_FAIL_DO_RR) {
681 m = 0; /* lowest valid score */
682 } else if (m == RT6_NUD_FAIL_HARD) {
/* Reachability-required lookups probe candidates they reject. */
686 if (strict & RT6_LOOKUP_F_REACHABLE)
689 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
691 *do_rr = match_do_rr;
/* Scan the fib6 leaf chain for the best route at @metric, starting at the
 * round-robin head @rr_head and wrapping to the start of the leaf list;
 * @cont collects routes at other metrics for a final pass.
 */
699 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
700 struct rt6_info *rr_head,
701 u32 metric, int oif, int strict,
704 struct rt6_info *rt, *match, *cont;
/* First half: rr_head to end of chain. */
709 for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
710 if (rt->rt6i_metric != metric) {
715 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Second half: start of chain up to rr_head (round-robin wrap). */
718 for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
719 if (rt->rt6i_metric != metric) {
724 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Finally consider the continuation list of other-metric routes. */
730 for (rt = cont; rt; rt = rt->dst.rt6_next)
731 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Pick the best route from fib6 node @fn, maintaining fn->rr_ptr for
 * round-robin among equal routes; falls back to the namespace null entry
 * when nothing matches.
 */
736 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
738 struct rt6_info *match, *rt0;
/* Initialise the round-robin pointer on first use. */
744 fn->rr_ptr = rt0 = fn->leaf;
746 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
750 struct rt6_info *next = rt0->dst.rt6_next;
752 /* no entries matched; do round-robin */
753 if (!next || next->rt6i_metric != rt0->rt6i_metric)
760 net = dev_net(rt0->dst.dev);
761 return match ? match : net->ipv6.ip6_null_entry;
/* True when the route has a gateway or is marked no-next-hop; such routes
 * never get RTF_ANYCAST treatment in ip6_rt_cache_alloc().
 */
764 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
766 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
769 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process an RFC 4191 Route Information option received in a Router
 * Advertisement on @dev from router @gwaddr: validate it, then add,
 * refresh or delete the corresponding route.
 * Returns 0 on success / -EINVAL on malformed input (return lines are not
 * all visible in this chunk).
 */
770 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
771 const struct in6_addr *gwaddr)
773 struct net *net = dev_net(dev);
774 struct route_info *rinfo = (struct route_info *) opt;
775 struct in6_addr prefix_buf, *prefix;
777 unsigned long lifetime;
780 if (len < sizeof(struct route_info)) {
784 /* Sanity check for prefix_len and length */
/* Option length is in units of 8 octets; 3 is the maximum (full
 * 128-bit prefix), and longer prefixes require longer options.
 */
785 if (rinfo->length > 3) {
787 } else if (rinfo->prefix_len > 128) {
789 } else if (rinfo->prefix_len > 64) {
790 if (rinfo->length < 2) {
793 } else if (rinfo->prefix_len > 0) {
794 if (rinfo->length < 1) {
799 pref = rinfo->route_pref;
800 if (pref == ICMPV6_ROUTER_PREF_INVALID)
803 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* A length-3 option carries the full 128-bit prefix in place;
 * otherwise copy only prefix_len bits into a zeroed buffer.
 */
805 if (rinfo->length == 3)
806 prefix = (struct in6_addr *)rinfo->prefix;
808 /* this function is safe */
809 ipv6_addr_prefix(&prefix_buf,
810 (struct in6_addr *)rinfo->prefix,
812 prefix = &prefix_buf;
/* Zero-length prefix means "default route via this router". */
815 if (rinfo->prefix_len == 0)
816 rt = rt6_get_dflt_router(gwaddr, dev);
818 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
819 gwaddr, dev->ifindex);
/* Zero lifetime withdraws an existing route. */
821 if (rt && !lifetime) {
827 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
/* Refresh preference bits on an existing or new route. */
830 rt->rt6i_flags = RTF_ROUTEINFO |
831 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
834 if (!addrconf_finite_timeout(lifetime))
835 rt6_clean_expires(rt);
837 rt6_set_expires(rt, jiffies + HZ * lifetime);
/* Walk up the fib6 tree from @fn looking for the next node that carries
 * route info (RTN_RTINFO), descending into source-address subtrees when
 * present; stops at the tree root (RTN_TL_ROOT).
 * NOTE(review): the loop construct and parent-step lines are missing from
 * this chunk of the listing.
 */
845 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
846 struct in6_addr *saddr)
848 struct fib6_node *pn;
850 if (fn->fn_flags & RTN_TL_ROOT)
853 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
854 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
857 if (fn->fn_flags & RTN_RTINFO)
/* Table lookup used by ip6_route_lookup()/rt6_lookup(): find the fib6
 * node, match by device, apply multipath selection, and backtrack on a
 * null-entry result.  Takes tb6_lock for reading; bumps the dst use
 * counter before returning.
 */
862 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
863 struct fib6_table *table,
864 struct flowi6 *fl6, int flags)
866 struct fib6_node *fn;
869 read_lock_bh(&table->tb6_lock);
870 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
873 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
/* ECMP: spread flows over siblings only when no oif pinned the route. */
874 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
875 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
876 if (rt == net->ipv6.ip6_null_entry) {
877 fn = fib6_backtrack(fn, &fl6->saddr);
881 dst_use(&rt->dst, jiffies);
882 read_unlock_bh(&table->tb6_lock);
/* Public entry point: dispatch the lookup through policy routing rules. */
887 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
890 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
892 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience wrapper around the policy lookup: build a flowi6 from the
 * raw addresses/oif and return the resulting rt6_info (error results are
 * filtered on lines not visible in this chunk).
 */
894 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
895 const struct in6_addr *saddr, int oif, int strict)
897 struct flowi6 fl6 = {
901 struct dst_entry *dst;
902 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
/* Only constrain by source when the caller supplied one. */
905 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
906 flags |= RT6_LOOKUP_F_HAS_SADDR;
909 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
911 return (struct rt6_info *) dst;
917 EXPORT_SYMBOL(rt6_lookup);
919 /* ip6_ins_rt is called with FREE table->tb6_lock.
920 It takes new route entry, the addition fails by any reason the
921 route is freed. In any case, if caller does not hold it, it may
/* Insert @rt into its fib6 table under the table write lock; @mxc carries
 * optional metrics for the new entry.
 */
925 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
926 struct mx6_config *mxc)
929 struct fib6_table *table;
931 table = rt->rt6i_table;
932 write_lock_bh(&table->tb6_lock);
933 err = fib6_add(&table->tb6_root, rt, info, mxc);
934 write_unlock_bh(&table->tb6_lock);
/* Public insert: no netlink attrs, no extra metrics. */
939 int ip6_ins_rt(struct rt6_info *rt)
941 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
942 struct mx6_config mxc = { .mx = NULL, };
944 return __ip6_ins_rt(rt, &info, &mxc);
/* Clone @ort into a host-specific RTF_CACHE route for @daddr (and @saddr
 * under CONFIG_IPV6_SUBTREES).  Cache/pcpu clones are first unwound to
 * the original route via dst.from.
 */
947 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
948 const struct in6_addr *daddr,
949 const struct in6_addr *saddr)
957 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
958 ort = (struct rt6_info *)ort->dst.from;
960 rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
966 ip6_rt_copy_init(rt, ort);
967 rt->rt6i_flags |= RTF_CACHE;
969 rt->dst.flags |= DST_HOST;
/* The clone is pinned to the exact destination. */
970 rt->rt6i_dst.addr = *daddr;
971 rt->rt6i_dst.plen = 128;
973 if (!rt6_is_gw_or_nonexthop(ort)) {
/* A /128-expanded on-link route to ourselves is anycast. */
974 if (ort->rt6i_dst.plen != 128 &&
975 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
976 rt->rt6i_flags |= RTF_ANYCAST;
977 #ifdef CONFIG_IPV6_SUBTREES
978 if (rt->rt6i_src.plen && saddr) {
979 rt->rt6i_src.addr = *saddr;
980 rt->rt6i_src.plen = 128;
/* Clone @rt into a per-CPU copy marked RTF_PCPU; metrics and expiry stay
 * tied to the parent via dst.from (set in ip6_rt_copy_init).
 */
988 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
990 struct rt6_info *pcpu_rt;
992 pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
993 rt->dst.dev, rt->dst.flags,
998 ip6_rt_copy_init(pcpu_rt, rt);
999 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1000 pcpu_rt->rt6i_flags |= RTF_PCPU;
1004 /* It should be called with read_lock_bh(&tb6_lock) acquired */
/* Return this CPU's cached clone of @rt, creating it on first use.  The
 * cmpxchg handles the (preemption) race where another context installed a
 * clone first; our fresh copy is discarded in that case.
 */
1005 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1007 struct rt6_info *pcpu_rt, *prev, **p;
1009 p = this_cpu_ptr(rt->rt6i_pcpu);
1015 pcpu_rt = ip6_rt_pcpu_alloc(rt);
/* Allocation failed: hand back the null entry instead. */
1017 struct net *net = dev_net(rt->dst.dev);
1019 pcpu_rt = net->ipv6.ip6_null_entry;
1023 prev = cmpxchg(p, NULL, pcpu_rt);
1025 /* If someone did it before us, return prev instead */
1026 dst_destroy(&pcpu_rt->dst);
1031 dst_hold(&pcpu_rt->dst);
1032 rt6_dst_from_metrics_check(pcpu_rt);
/* Core policy-routing lookup shared by the input and output paths:
 * select the best route (with reachability preference when forwarding is
 * off), backtrack / relax strictness as needed, then return either the
 * route itself (null/RTF_CACHE), an uncached clone (FLOWI_FLAG_KNOWN_NH
 * without a gateway), or a per-CPU clone.
 */
1036 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1037 struct flowi6 *fl6, int flags)
1039 struct fib6_node *fn, *saved_fn;
1040 struct rt6_info *rt;
1043 strict |= flags & RT6_LOOKUP_F_IFACE;
/* Hosts (forwarding off) prefer reachable routers per RFC 4191. */
1044 if (net->ipv6.devconf_all->forwarding == 0)
1045 strict |= RT6_LOOKUP_F_REACHABLE;
1047 read_lock_bh(&table->tb6_lock);
1049 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1053 rt = rt6_select(fn, oif, strict);
1054 if (rt->rt6i_nsiblings)
1055 rt = rt6_multipath_select(rt, fl6, oif, strict);
1056 if (rt == net->ipv6.ip6_null_entry) {
1057 fn = fib6_backtrack(fn, &fl6->saddr);
1059 goto redo_rt6_select;
1060 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1061 /* also consider unreachable route */
1062 strict &= ~RT6_LOOKUP_F_REACHABLE;
1064 goto redo_rt6_select;
/* Null entry and RTF_CACHE clones are returned as-is. */
1069 if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1070 dst_use(&rt->dst, jiffies);
1071 read_unlock_bh(&table->tb6_lock);
1073 rt6_dst_from_metrics_check(rt);
1075 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1076 !(rt->rt6i_flags & RTF_GATEWAY))) {
1077 /* Create a RTF_CACHE clone which will not be
1078 * owned by the fib6 tree. It is for the special case where
1079 * the daddr in the skb during the neighbor look-up is different
1080 * from the fl6->daddr used to look-up route here.
1083 struct rt6_info *uncached_rt;
/* Hold the parent across the unlocked clone allocation. */
1085 dst_use(&rt->dst, jiffies);
1086 read_unlock_bh(&table->tb6_lock);
1088 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1089 dst_release(&rt->dst);
1092 rt6_uncached_list_add(uncached_rt);
1094 uncached_rt = net->ipv6.ip6_null_entry;
1096 dst_hold(&uncached_rt->dst);
1100 /* Get a percpu copy */
1102 struct rt6_info *pcpu_rt;
1104 rt->dst.lastuse = jiffies;
1106 pcpu_rt = rt6_get_pcpu_route(rt);
1107 read_unlock_bh(&table->tb6_lock);
/* Input-path wrapper: route on the incoming interface index. */
1113 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1114 struct flowi6 *fl6, int flags)
1116 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/* Resolve an input route via policy rules; link-local/multicast
 * destinations force a strict interface match (except on PIM pseudo
 * devices).
 */
1119 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1120 struct net_device *dev,
1121 struct flowi6 *fl6, int flags)
1123 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1124 flags |= RT6_LOOKUP_F_IFACE;
1126 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
/* Attach an input route to @skb: build the flowi6 from the IPv6 header
 * and the receiving device, then set skb's dst.
 */
1129 void ip6_route_input(struct sk_buff *skb)
1131 const struct ipv6hdr *iph = ipv6_hdr(skb);
1132 struct net *net = dev_net(skb->dev);
1133 int flags = RT6_LOOKUP_F_HAS_SADDR;
1134 struct flowi6 fl6 = {
1135 .flowi6_iif = skb->dev->ifindex,
1136 .daddr = iph->daddr,
1137 .saddr = iph->saddr,
1138 .flowlabel = ip6_flowinfo(iph),
1139 .flowi6_mark = skb->mark,
1140 .flowi6_proto = iph->nexthdr,
1144 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/* Output-path wrapper: route on the requested outgoing interface. */
1147 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1148 struct flowi6 *fl6, int flags)
1150 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/* Public output route lookup: interface match is strict for bound sockets
 * and link-local/multicast destinations; source-address preferences come
 * from the socket when one is given.
 */
1153 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1158 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1160 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1161 flags |= RT6_LOOKUP_F_IFACE;
1163 if (!ipv6_addr_any(&fl6->saddr))
1164 flags |= RT6_LOOKUP_F_HAS_SADDR;
1166 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1168 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1170 EXPORT_SYMBOL(ip6_route_output);
/* Convert @dst_orig into a standalone "blackhole" copy used by e.g. xfrm:
 * same addressing/metrics, but input/output silently discard.  Always
 * releases @dst_orig; returns the copy or ERR_PTR(-ENOMEM).
 */
1172 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1174 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1175 struct dst_entry *new = NULL;
1177 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1181 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
1184 new->input = dst_discard;
1185 new->output = dst_discard_sk;
/* Share read-only metrics directly; otherwise take a private copy. */
1187 if (dst_metrics_read_only(&ort->dst))
1188 new->_metrics = ort->dst._metrics;
1190 dst_copy_metrics(new, &ort->dst);
1191 rt->rt6i_idev = ort->rt6i_idev;
1193 in6_dev_hold(rt->rt6i_idev);
1195 rt->rt6i_gateway = ort->rt6i_gateway;
1196 rt->rt6i_flags = ort->rt6i_flags;
1197 rt->rt6i_metric = 0;
1199 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1200 #ifdef CONFIG_IPV6_SUBTREES
1201 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1207 dst_release(dst_orig);
1208 return new ? new : ERR_PTR(-ENOMEM);
1212 * Destination cache support functions
/* Re-sync a clone's metrics pointer with its parent's (dst.from) if the
 * parent's metrics were replaced since the clone was made.
 */
1215 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1218 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1219 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
/* Validate a tree-owned route: the fib6 node must still exist with the
 * serial number the caller cached, and the route must not be expired.
 */
1222 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1224 if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1227 if (rt6_check_expired(rt))
/* Validate a clone by validating the route it was cloned from. */
1233 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1235 if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1236 rt6_check((struct rt6_info *)(rt->dst.from), cookie))
/* dst_ops->check callback: decide whether a cached dst is still valid for
 * @cookie, dispatching clones (pcpu/uncached) to the dst.from-based check.
 */
1242 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1244 struct rt6_info *rt;
1246 rt = (struct rt6_info *) dst;
1248 /* All IPV6 dsts are created with ->obsolete set to the value
1249 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1250 * into this function always.
1253 rt6_dst_from_metrics_check(rt);
1255 if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1256 return rt6_dst_from_check(rt, cookie);
1258 return rt6_check(rt, cookie);
/* dst_ops->negative_advice callback: drop expired cache clones so the
 * caller re-looks-up the route.  NOTE(review): the deletion/return lines
 * are missing from this chunk of the listing.
 */
1261 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1263 struct rt6_info *rt = (struct rt6_info *) dst;
1266 if (rt->rt6i_flags & RTF_CACHE) {
1267 if (rt6_check_expired(rt)) {
/* dst_ops->link_failure callback: report unreachability via ICMPv6 and
 * invalidate the offending cached/default route.
 */
1279 static void ip6_link_failure(struct sk_buff *skb)
1281 struct rt6_info *rt;
1283 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1285 rt = (struct rt6_info *) skb_dst(skb);
1287 if (rt->rt6i_flags & RTF_CACHE) {
/* Bump the node serial so every cached dst fails rt6_check(). */
1291 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1292 rt->rt6i_node->fn_sernum = -1;
/* Record a learned path MTU on a cache clone and arm its expiry timer
 * (sysctl ip6_rt_mtu_expires).
 */
1297 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1299 struct net *net = dev_net(rt->dst.dev);
1301 rt->rt6i_flags |= RTF_MODIFIED;
1302 rt->rt6i_pmtu = mtu;
1303 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* Apply a PMTU update to @dst: update in place for cache clones,
 * otherwise create a cache clone carrying the new MTU (addresses taken
 * from the packet header or, failing that, the socket).
 */
1306 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1307 const struct ipv6hdr *iph, u32 mtu)
1309 struct rt6_info *rt6 = (struct rt6_info *)dst;
/* Local delivery never needs PMTU state. */
1311 if (rt6->rt6i_flags & RTF_LOCAL)
/* Clamp to the IPv6 minimum; ignore non-shrinking updates. */
1315 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1316 if (mtu >= dst_mtu(dst))
1319 if (rt6->rt6i_flags & RTF_CACHE) {
1320 rt6_do_update_pmtu(rt6, mtu);
1322 const struct in6_addr *daddr, *saddr;
1323 struct rt6_info *nrt6;
1326 daddr = &iph->daddr;
1327 saddr = &iph->saddr;
1329 daddr = &sk->sk_v6_daddr;
1330 saddr = &inet6_sk(sk)->saddr;
1334 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1336 rt6_do_update_pmtu(nrt6, mtu);
1338 /* ip6_ins_rt(nrt6) will bump the
1339 * rt6->rt6i_node->fn_sernum
1340 * which will fail the next rt6_check() and
1341 * invalidate the sk->sk_dst_cache.
/* dst_ops->update_pmtu callback: thin wrapper extracting the IPv6 header
 * from the skb when one is available.
 */
1348 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1349 struct sk_buff *skb, u32 mtu)
1351 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
/* Update the path MTU for the flow described by the (inner) packet in
 * @skb: look up the output route and push the new MTU onto it.  @mtu is
 * network byte order (from the ICMPv6 Packet Too Big message).
 */
1354 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1357 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1358 struct dst_entry *dst;
1361 memset(&fl6, 0, sizeof(fl6));
1362 fl6.flowi6_oif = oif;
1363 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1364 fl6.daddr = iph->daddr;
1365 fl6.saddr = iph->saddr;
1366 fl6.flowlabel = ip6_flowinfo(iph);
1368 dst = ip6_route_output(net, NULL, &fl6);
1370 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1373 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket-flavoured wrapper: derive netns, oif and mark from @sk. */
1375 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1377 ip6_update_pmtu(skb, sock_net(sk), mtu,
1378 sk->sk_bound_dev_if, sk->sk_mark);
1380 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1382 /* Handle redirects */
/* flowi6 extended with the redirecting router's address, passed through
 * fib6_rule_lookup() to __ip6_route_redirect().  NOTE(review): the
 * embedded flowi6 member line is not visible in this chunk.
 */
1383 struct ip6rd_flowi {
1385 struct in6_addr gateway;
/* Find the route an ICMPv6 Redirect applies to: the current gateway route
 * for the destination whose next hop is the router that sent the
 * redirect, per RFC 4861's requirement that redirects come from the
 * current next hop.
 */
1388 static struct rt6_info *__ip6_route_redirect(struct net *net,
1389 struct fib6_table *table,
1393 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1394 struct rt6_info *rt;
1395 struct fib6_node *fn;
1397 /* Get the "current" route for this destination and
1398 * check if the redirect has come from approriate router.
1400 * RFC 4861 specifies that redirects should only be
1401 * accepted if they come from the nexthop to the target.
1402 * Due to the way the routes are chosen, this notion
1403 * is a bit fuzzy and one might need to check all possible
1407 read_lock_bh(&table->tb6_lock);
1408 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1410 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1411 if (rt6_check_expired(rt))
1415 if (!(rt->rt6i_flags & RTF_GATEWAY))
1417 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
/* The sending router must be this route's gateway. */
1419 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1425 rt = net->ipv6.ip6_null_entry;
1426 else if (rt->dst.error) {
1427 rt = net->ipv6.ip6_null_entry;
1431 if (rt == net->ipv6.ip6_null_entry) {
1432 fn = fib6_backtrack(fn, &fl6->saddr);
1440 read_unlock_bh(&table->tb6_lock);
/* Wrap @fl6 plus the redirecting @gateway in an ip6rd_flowi and run the
 * redirect lookup through policy routing rules.
 * NOTE(review): the line copying *fl6 into rdfl.fl6 is missing from this
 * chunk of the listing.
 */
1445 static struct dst_entry *ip6_route_redirect(struct net *net,
1446 const struct flowi6 *fl6,
1447 const struct in6_addr *gateway)
1449 int flags = RT6_LOOKUP_F_HAS_SADDR;
1450 struct ip6rd_flowi rdfl;
1453 rdfl.gateway = *gateway;
1455 return fib6_rule_lookup(net, &rdfl.fl6,
1456 flags, __ip6_route_redirect);
1459 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1461 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1462 struct dst_entry *dst;
1465 memset(&fl6, 0, sizeof(fl6));
1466 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1467 fl6.flowi6_oif = oif;
1468 fl6.flowi6_mark = mark;
1469 fl6.daddr = iph->daddr;
1470 fl6.saddr = iph->saddr;
1471 fl6.flowlabel = ip6_flowinfo(iph);
1473 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1474 rt6_do_redirect(dst, NULL, skb);
1477 EXPORT_SYMBOL_GPL(ip6_redirect);
/*
 * Variant of ip6_redirect() used when the offending packet's original
 * IPv6 header is not available: rebuild the flow from the Redirect
 * message itself (msg->dest) and the outer header addresses.
 */
1479 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1482 const struct ipv6hdr *iph = ipv6_hdr(skb);
1483 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1484 struct dst_entry *dst;
1487 memset(&fl6, 0, sizeof(fl6));
1488 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1489 fl6.flowi6_oif = oif;
1490 fl6.flowi6_mark = mark;
1491 fl6.daddr = msg->dest;
/* outer header: we (iph->daddr) were the target of the Redirect */
1492 fl6.saddr = iph->daddr;
1494 dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1495 rt6_do_redirect(dst, NULL, skb);
/* Socket-level convenience wrapper: apply a Redirect using the socket's
 * bound device and fwmark. */
1499 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1501 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1503 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/*
 * Default advertised MSS for a route: path MTU minus fixed IPv6 + TCP
 * header overhead, clamped below by the ip6_rt_min_advmss sysctl and
 * above by the non-jumbo maximum payload.
 */
1505 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1507 struct net_device *dev = dst->dev;
1508 unsigned int mtu = dst_mtu(dst);
1509 struct net *net = dev_net(dev);
1511 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1513 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1514 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1517 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1518 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1519 * IPV6_MAXPLEN is also valid and means: "any MSS,
1520 * rely only on pmtu discovery"
1522 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/*
 * dst_ops->mtu hook: prefer the per-route learned PMTU (rt6i_pmtu),
 * then the RTAX_MTU metric, then the device's IPv6 MTU (cnf.mtu6),
 * capped at IP6_MAX_MTU.  NOTE(review): fallback ordering inferred from
 * visible lines — intermediate lines are elided in this excerpt.
 */
1527 static unsigned int ip6_mtu(const struct dst_entry *dst)
1529 const struct rt6_info *rt = (const struct rt6_info *)dst;
1530 unsigned int mtu = rt->rt6i_pmtu;
1531 struct inet6_dev *idev;
1536 mtu = dst_metric_raw(dst, RTAX_MTU);
1543 idev = __in6_dev_get(dst->dev);
1545 mtu = idev->cnf.mtu6;
1549 return min_t(unsigned int, mtu, IP6_MAX_MTU);
/* Singly-linked list of ephemeral dsts created by icmp6_dst_alloc(),
 * reclaimed by icmp6_dst_gc(); protected by icmp6_dst_lock. */
1552 static struct dst_entry *icmp6_dst_gc_list;
1553 static DEFINE_SPINLOCK(icmp6_dst_lock);
/*
 * Allocate an ephemeral host route used for sending ICMPv6 messages.
 * The entry is not inserted into the FIB; it is chained on
 * icmp6_dst_gc_list so icmp6_dst_gc() can reclaim it once the refcount
 * drops.  Returns an ERR_PTR on failure.
 */
1555 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1558 struct dst_entry *dst;
1559 struct rt6_info *rt;
1560 struct inet6_dev *idev = in6_dev_get(dev);
1561 struct net *net = dev_net(dev);
1563 if (unlikely(!idev))
1564 return ERR_PTR(-ENODEV);
1566 rt = ip6_dst_alloc(net, dev, 0, NULL);
1567 if (unlikely(!rt)) {
1569 dst = ERR_PTR(-ENOMEM);
1573 rt->dst.flags |= DST_HOST;
1574 rt->dst.output = ip6_output;
1575 atomic_set(&rt->dst.__refcnt, 1);
1576 rt->rt6i_gateway = fl6->daddr;
1577 rt->rt6i_dst.addr = fl6->daddr;
1578 rt->rt6i_dst.plen = 128;
1579 rt->rt6i_idev = idev;
/* hop limit 0 in the metric means "use the device/route default" */
1580 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
/* chain on the private gc list instead of inserting into the FIB */
1582 spin_lock_bh(&icmp6_dst_lock);
1583 rt->dst.next = icmp6_dst_gc_list;
1584 icmp6_dst_gc_list = &rt->dst;
1585 spin_unlock_bh(&icmp6_dst_lock);
1587 fib6_force_start_gc(net);
1589 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/*
 * Walk icmp6_dst_gc_list and free entries whose refcount has dropped
 * to zero; entries still referenced stay on the list.
 * NOTE(review): the unlink/advance and "more" accounting lines are
 * elided in this excerpt.
 */
1595 int icmp6_dst_gc(void)
1597 struct dst_entry *dst, **pprev;
1600 spin_lock_bh(&icmp6_dst_lock);
1601 pprev = &icmp6_dst_gc_list;
1603 while ((dst = *pprev) != NULL) {
1604 if (!atomic_read(&dst->__refcnt)) {
1613 spin_unlock_bh(&icmp6_dst_lock);
/*
 * Apply @func to every entry on icmp6_dst_gc_list, unlinking entries
 * for which it returns nonzero (mirrors fib6_clean_all() for the
 * ephemeral ICMPv6 routes that live outside the FIB).
 */
1618 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1621 struct dst_entry *dst, **pprev;
1623 spin_lock_bh(&icmp6_dst_lock);
1624 pprev = &icmp6_dst_gc_list;
1625 while ((dst = *pprev) != NULL) {
1626 struct rt6_info *rt = (struct rt6_info *) dst;
1627 if (func(rt, arg)) {
1634 spin_unlock_bh(&icmp6_dst_lock);
/*
 * dst_ops->gc hook: run FIB6 garbage collection, rate-limited by
 * ip6_rt_gc_min_interval unless the entry count already exceeds
 * ip6_rt_max_size.  ip6_rt_gc_expire is an adaptive aggressiveness
 * value: bumped before each run, reset to half the gc timeout when the
 * table shrinks below gc_thresh, and decayed by the elasticity factor.
 * Returns nonzero while the table is still over the size limit.
 */
1637 static int ip6_dst_gc(struct dst_ops *ops)
1639 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1640 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1641 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1642 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1643 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1644 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1647 entries = dst_entries_get_fast(ops);
1648 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1649 entries <= rt_max_size)
1652 net->ipv6.ip6_rt_gc_expire++;
1653 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1654 entries = dst_entries_get_slow(ops);
1655 if (entries < ops->gc_thresh)
1656 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1658 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1659 return entries > rt_max_size;
/*
 * Convert the netlink RTA_METRICS attribute list from @cfg into an
 * RTAX_MAX-sized u32 array in @mxc, marking populated slots in
 * mxc->mx_valid.  RTAX_CC_ALGO is special-cased: the string name is
 * mapped to a TCP congestion-control key.
 * NOTE(review): error-path lines (kfree of mp, return codes) are elided
 * in this excerpt.
 */
1662 static int ip6_convert_metrics(struct mx6_config *mxc,
1663 const struct fib6_config *cfg)
1672 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1676 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1677 int type = nla_type(nla);
1682 if (unlikely(type > RTAX_MAX))
1684 if (type == RTAX_CC_ALGO) {
1685 char tmp[TCP_CA_NAME_MAX];
1687 nla_strlcpy(tmp, nla, sizeof(tmp));
1688 val = tcp_ca_get_key_by_name(tmp);
1689 if (val == TCP_CA_UNSPEC)
1692 val = nla_get_u32(nla);
/* metric types are 1-based; bit (type - 1) marks the slot as set */
1696 __set_bit(type - 1, mxc->mx_valid);
/*
 * Create and insert an IPv6 route described by @cfg (from netlink or
 * ioctl).  Validates prefix lengths, resolves the output device and
 * table, allocates the rt6_info, wires input/output handlers (including
 * lwtunnel redirection), promotes loopback routes to reject routes,
 * validates any gateway, and finally inserts via __ip6_ins_rt() with
 * converted metrics.  Returns 0 or a negative errno.
 * NOTE(review): many error-path and cleanup lines are elided in this
 * excerpt — the visible lines show the main flow only.
 */
1708 int ip6_route_add(struct fib6_config *cfg)
1711 struct net *net = cfg->fc_nlinfo.nl_net;
1712 struct rt6_info *rt = NULL;
1713 struct net_device *dev = NULL;
1714 struct inet6_dev *idev = NULL;
1715 struct fib6_table *table;
1716 struct mx6_config mxc = { .mx = NULL, };
/* IPv6 prefixes cannot exceed 128 bits */
1719 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1721 #ifndef CONFIG_IPV6_SUBTREES
/* source-specific routing requires CONFIG_IPV6_SUBTREES */
1722 if (cfg->fc_src_len)
1725 if (cfg->fc_ifindex) {
1727 dev = dev_get_by_index(net, cfg->fc_ifindex);
1730 idev = in6_dev_get(dev);
1735 if (cfg->fc_metric == 0)
1736 cfg->fc_metric = IP6_RT_PRIO_USER;
/* without NLM_F_CREATE only look up an existing table; warn (but still
 * create) if the table is missing, for backward compatibility */
1739 if (cfg->fc_nlinfo.nlh &&
1740 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1741 table = fib6_get_table(net, cfg->fc_table);
1743 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1744 table = fib6_new_table(net, cfg->fc_table);
1747 table = fib6_new_table(net, cfg->fc_table);
1753 rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);
1760 if (cfg->fc_flags & RTF_EXPIRES)
1761 rt6_set_expires(rt, jiffies +
1762 clock_t_to_jiffies(cfg->fc_expires));
1764 rt6_clean_expires(rt);
1766 if (cfg->fc_protocol == RTPROT_UNSPEC)
1767 cfg->fc_protocol = RTPROT_BOOT;
1768 rt->rt6i_protocol = cfg->fc_protocol;
1770 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* choose the input handler by destination class: multicast, local
 * delivery, or forwarding */
1772 if (addr_type & IPV6_ADDR_MULTICAST)
1773 rt->dst.input = ip6_mc_input;
1774 else if (cfg->fc_flags & RTF_LOCAL)
1775 rt->dst.input = ip6_input;
1777 rt->dst.input = ip6_forward;
1779 rt->dst.output = ip6_output;
/* lightweight tunnel encapsulation: interpose lwtunnel handlers over
 * the just-chosen input/output, keeping the originals for chaining */
1781 if (cfg->fc_encap) {
1782 struct lwtunnel_state *lwtstate;
1784 err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1785 cfg->fc_encap, &lwtstate);
1788 rt->dst.lwtstate = lwtstate_get(lwtstate);
1789 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1790 rt->dst.lwtstate->orig_output = rt->dst.output;
1791 rt->dst.output = lwtunnel_output;
1793 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1794 rt->dst.lwtstate->orig_input = rt->dst.input;
1795 rt->dst.input = lwtunnel_input;
1799 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1800 rt->rt6i_dst.plen = cfg->fc_dst_len;
1801 if (rt->rt6i_dst.plen == 128)
1802 rt->dst.flags |= DST_HOST;
1804 #ifdef CONFIG_IPV6_SUBTREES
1805 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1806 rt->rt6i_src.plen = cfg->fc_src_len;
1809 rt->rt6i_metric = cfg->fc_metric;
1811 /* We cannot add true routes via loopback here,
1812 they would result in kernel looping; promote them to reject routes
1814 if ((cfg->fc_flags & RTF_REJECT) ||
1815 (dev && (dev->flags & IFF_LOOPBACK) &&
1816 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1817 !(cfg->fc_flags & RTF_LOCAL))) {
1818 /* hold loopback dev/idev if we haven't done so. */
1819 if (dev != net->loopback_dev) {
1824 dev = net->loopback_dev;
1826 idev = in6_dev_get(dev);
1832 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
/* pick dst.error / handlers by the reject route subtype
 * (blackhole / prohibit / throw / unreachable) */
1833 switch (cfg->fc_type) {
1835 rt->dst.error = -EINVAL;
1836 rt->dst.output = dst_discard_sk;
1837 rt->dst.input = dst_discard;
1840 rt->dst.error = -EACCES;
1841 rt->dst.output = ip6_pkt_prohibit_out;
1842 rt->dst.input = ip6_pkt_prohibit;
1846 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1848 rt->dst.output = ip6_pkt_discard_out;
1849 rt->dst.input = ip6_pkt_discard;
1855 if (cfg->fc_flags & RTF_GATEWAY) {
1856 const struct in6_addr *gw_addr;
1859 gw_addr = &cfg->fc_gateway;
1860 gwa_type = ipv6_addr_type(gw_addr);
1862 /* if gw_addr is local we will fail to detect this in case
1863 * address is still TENTATIVE (DAD in progress). rt6_lookup()
1864 * will return already-added prefix route via interface that
1865 * prefix route was assigned to, which might be non-loopback.
1868 if (ipv6_chk_addr_and_flags(net, gw_addr,
1869 gwa_type & IPV6_ADDR_LINKLOCAL ?
1873 rt->rt6i_gateway = *gw_addr;
1875 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1876 struct rt6_info *grt;
1878 /* IPv6 strictly inhibits using not link-local
1879 addresses as nexthop address.
1880 Otherwise, router will not able to send redirects.
1881 It is very good, but in some (rare!) circumstances
1882 (SIT, PtP, NBMA NOARP links) it is handy to allow
1883 some exceptions. --ANK
1885 if (!(gwa_type & IPV6_ADDR_UNICAST))
/* verify the non-link-local gateway is actually reachable */
1888 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1890 err = -EHOSTUNREACH;
1894 if (dev != grt->dst.dev) {
1900 idev = grt->rt6i_idev;
1902 in6_dev_hold(grt->rt6i_idev);
/* the route to the gateway must not itself need a gateway */
1904 if (!(grt->rt6i_flags & RTF_GATEWAY))
1912 if (!dev || (dev->flags & IFF_LOOPBACK))
1920 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
/* the preferred source must be an address on the output device */
1921 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1925 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1926 rt->rt6i_prefsrc.plen = 128;
1928 rt->rt6i_prefsrc.plen = 0;
1930 rt->rt6i_flags = cfg->fc_flags;
1934 rt->rt6i_idev = idev;
1935 rt->rt6i_table = table;
1937 cfg->fc_nlinfo.nl_net = dev_net(dev);
1939 err = ip6_convert_metrics(&mxc, cfg);
1943 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
/*
 * Core route deletion: remove @rt from its FIB6 table under the table
 * write lock.  Deleting the null entry is rejected (error path elided
 * in this excerpt).  @info carries netlink notification context.
 */
1957 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1960 struct fib6_table *table;
1961 struct net *net = dev_net(rt->dst.dev);
1963 if (rt == net->ipv6.ip6_null_entry) {
1968 table = rt->rt6i_table;
1969 write_lock_bh(&table->tb6_lock);
1970 err = fib6_del(rt, info);
1971 write_unlock_bh(&table->tb6_lock);
/* Public wrapper around __ip6_del_rt() with default netlink info for
 * the route's namespace. */
1978 int ip6_del_rt(struct rt6_info *rt)
1980 struct nl_info info = {
1981 .nl_net = dev_net(rt->dst.dev),
1983 return __ip6_del_rt(rt, &info);
/*
 * Delete the route matching @cfg: locate the FIB node for the
 * destination/source prefixes, then scan its leaf chain for an entry
 * matching the optional ifindex, gateway and metric filters.  A match
 * is deleted via __ip6_del_rt() (which takes its own reference; the
 * dst_hold line is elided in this excerpt).
 */
1986 static int ip6_route_del(struct fib6_config *cfg)
1988 struct fib6_table *table;
1989 struct fib6_node *fn;
1990 struct rt6_info *rt;
1993 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1997 read_lock_bh(&table->tb6_lock);
1999 fn = fib6_locate(&table->tb6_root,
2000 &cfg->fc_dst, cfg->fc_dst_len,
2001 &cfg->fc_src, cfg->fc_src_len);
2004 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
/* skip cached clones unless the caller explicitly asked for them */
2005 if ((rt->rt6i_flags & RTF_CACHE) &&
2006 !(cfg->fc_flags & RTF_CACHE))
2008 if (cfg->fc_ifindex &&
2010 rt->dst.dev->ifindex != cfg->fc_ifindex))
2012 if (cfg->fc_flags & RTF_GATEWAY &&
2013 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2015 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2018 read_unlock_bh(&table->tb6_lock);
2020 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2023 read_unlock_bh(&table->tb6_lock);
/*
 * Validate and apply a received ICMPv6 Redirect (RFC 4861 section 8):
 * sanity-check the message (length, non-multicast destination,
 * link-local unicast target unless on-link), verify our interface
 * accepts redirects, parse ND options for a target link-layer address,
 * update the neighbour cache, then install a cached gateway route
 * (nrt) toward the new first hop and notify netevent listeners.
 * NOTE(review): several error-exit and cleanup lines are elided in
 * this excerpt.
 */
2028 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2030 struct net *net = dev_net(skb->dev);
2031 struct netevent_redirect netevent;
2032 struct rt6_info *rt, *nrt = NULL;
2033 struct ndisc_options ndopts;
2034 struct inet6_dev *in6_dev;
2035 struct neighbour *neigh;
2037 int optlen, on_link;
2040 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2041 optlen -= sizeof(*msg);
2044 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2048 msg = (struct rd_msg *)icmp6_hdr(skb);
2050 if (ipv6_addr_is_multicast(&msg->dest)) {
2051 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
/* dest == target means the destination is on-link (no new gateway) */
2056 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2058 } else if (ipv6_addr_type(&msg->target) !=
2059 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2060 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2064 in6_dev = __in6_dev_get(skb->dev);
/* routers must not accept redirects; neither do hosts that opted out */
2067 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2071 * The IP source address of the Redirect MUST be the same as the current
2072 * first-hop router for the specified ICMP Destination Address.
2075 if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2076 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2081 if (ndopts.nd_opts_tgt_lladdr) {
2082 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2085 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2090 rt = (struct rt6_info *) dst;
2091 if (rt == net->ipv6.ip6_null_entry) {
2092 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2096 /* Redirect received -> path was valid.
2097 * Look, redirects are sent only in response to data packets,
2098 * so that this nexthop apparently is reachable. --ANK
2100 dst_confirm(&rt->dst);
2102 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2107 * We have finally decided to accept it.
/* new gateway gets router flags unless the target is on-link */
2110 neigh_update(neigh, lladdr, NUD_STALE,
2111 NEIGH_UPDATE_F_WEAK_OVERRIDE|
2112 NEIGH_UPDATE_F_OVERRIDE|
2113 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2114 NEIGH_UPDATE_F_ISROUTER))
/* clone a cached host route toward msg->dest via the new nexthop */
2117 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2121 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2123 nrt->rt6i_flags &= ~RTF_GATEWAY;
2125 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2127 if (ip6_ins_rt(nrt))
2130 netevent.old = &rt->dst;
2131 netevent.new = &nrt->dst;
2132 netevent.daddr = &msg->dest;
2133 netevent.neigh = neigh;
2134 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
/* a superseded cached route is removed outright */
2136 if (rt->rt6i_flags & RTF_CACHE) {
2137 rt = (struct rt6_info *) dst_clone(&rt->dst);
2142 neigh_release(neigh);
2146 * Misc support functions
/*
 * Link clone @rt to its parent @from: hold a reference on the parent's
 * dst and share (read-only) its metrics.  The parent must not itself
 * be a clone (BUG_ON).  Clearing RTF_EXPIRES makes rt6_get_expires()
 * read the expiry through dst.from.
 */
2149 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2151 BUG_ON(from->dst.from);
2153 rt->rt6i_flags &= ~RTF_EXPIRES;
2154 dst_hold(&from->dst);
2155 rt->dst.from = &from->dst;
2156 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
/*
 * Initialise clone @rt from original route @ort: copy handlers,
 * addressing, flags and table linkage, take references on the idev and
 * lwtunnel state, and tie metrics/expiry to the parent via
 * rt6_set_from().
 */
2159 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2161 rt->dst.input = ort->dst.input;
2162 rt->dst.output = ort->dst.output;
2163 rt->rt6i_dst = ort->rt6i_dst;
2164 rt->dst.error = ort->dst.error;
2165 rt->rt6i_idev = ort->rt6i_idev;
2167 in6_dev_hold(rt->rt6i_idev);
2168 rt->dst.lastuse = jiffies;
2169 rt->rt6i_gateway = ort->rt6i_gateway;
2170 rt->rt6i_flags = ort->rt6i_flags;
2171 rt6_set_from(rt, ort);
2172 rt->rt6i_metric = ort->rt6i_metric;
2173 #ifdef CONFIG_IPV6_SUBTREES
2174 rt->rt6i_src = ort->rt6i_src;
2176 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2177 rt->rt6i_table = ort->rt6i_table;
2178 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2181 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Find an RA Route Information route (RFC 4191) for @prefix/@prefixlen
 * learned via @gwaddr on @ifindex in the RT6_TABLE_INFO table.
 * Returns the matching rt6_info (reference handling elided in this
 * excerpt) or NULL.
 */
2182 static struct rt6_info *rt6_get_route_info(struct net *net,
2183 const struct in6_addr *prefix, int prefixlen,
2184 const struct in6_addr *gwaddr, int ifindex)
2186 struct fib6_node *fn;
2187 struct rt6_info *rt = NULL;
2188 struct fib6_table *table;
2190 table = fib6_get_table(net, RT6_TABLE_INFO);
2194 read_lock_bh(&table->tb6_lock);
2195 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2199 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2200 if (rt->dst.dev->ifindex != ifindex)
2202 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2204 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2210 read_unlock_bh(&table->tb6_lock);
/*
 * Install an RA Route Information route (RFC 4191) for @prefix via
 * @gwaddr on @ifindex with preference @pref, then return the freshly
 * inserted entry by re-lookup (ip6_route_add() does not hand back the
 * rt6_info).
 */
2214 static struct rt6_info *rt6_add_route_info(struct net *net,
2215 const struct in6_addr *prefix, int prefixlen,
2216 const struct in6_addr *gwaddr, int ifindex,
2219 struct fib6_config cfg = {
2220 .fc_table = RT6_TABLE_INFO,
2221 .fc_metric = IP6_RT_PRIO_USER,
2222 .fc_ifindex = ifindex,
2223 .fc_dst_len = prefixlen,
2224 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2225 RTF_UP | RTF_PREF(pref),
2226 .fc_nlinfo.portid = 0,
2227 .fc_nlinfo.nlh = NULL,
2228 .fc_nlinfo.nl_net = net,
2231 cfg.fc_dst = *prefix;
2232 cfg.fc_gateway = *gwaddr;
2234 /* We should treat it as a default route if prefix length is 0. */
2236 cfg.fc_flags |= RTF_DEFAULT;
2238 ip6_route_add(&cfg);
2240 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/*
 * Find the RA-learned default route via gateway @addr on @dev in the
 * RT6_TABLE_DFLT table; both RTF_ADDRCONF and RTF_DEFAULT must be set.
 * Returns the matching rt6_info (reference handling elided in this
 * excerpt) or NULL.
 */
2244 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2246 struct rt6_info *rt;
2247 struct fib6_table *table;
2249 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2253 read_lock_bh(&table->tb6_lock);
2254 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2255 if (dev == rt->dst.dev &&
2256 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2257 ipv6_addr_equal(&rt->rt6i_gateway, addr))
2262 read_unlock_bh(&table->tb6_lock);
/*
 * Install an RA-learned default router @gwaddr on @dev with router
 * preference @pref, then return the inserted entry via
 * rt6_get_dflt_router() (ip6_route_add() does not return the rt6_info).
 */
2266 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2267 struct net_device *dev,
2270 struct fib6_config cfg = {
2271 .fc_table = RT6_TABLE_DFLT,
2272 .fc_metric = IP6_RT_PRIO_USER,
2273 .fc_ifindex = dev->ifindex,
2274 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2275 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2276 .fc_nlinfo.portid = 0,
2277 .fc_nlinfo.nlh = NULL,
2278 .fc_nlinfo.nl_net = dev_net(dev),
2281 cfg.fc_gateway = *gwaddr;
2283 ip6_route_add(&cfg);
2285 return rt6_get_dflt_router(gwaddr, dev);
/*
 * Remove all RA-learned default routers from RT6_TABLE_DFLT, except on
 * interfaces with accept_ra == 2 (accept RA even when forwarding).
 * The lock is dropped before each deletion and the scan restarted
 * (restart lines elided in this excerpt).
 */
2288 void rt6_purge_dflt_routers(struct net *net)
2290 struct rt6_info *rt;
2291 struct fib6_table *table;
2293 /* NOTE: Keep consistent with rt6_get_dflt_router */
2294 table = fib6_get_table(net, RT6_TABLE_DFLT);
2299 read_lock_bh(&table->tb6_lock);
2300 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2301 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2302 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2304 read_unlock_bh(&table->tb6_lock);
2309 read_unlock_bh(&table->tb6_lock);
/*
 * Translate a legacy ioctl in6_rtmsg into the common fib6_config
 * representation used by ip6_route_add()/ip6_route_del().  Routes from
 * this path always go to the main table.
 */
2312 static void rtmsg_to_fib6_config(struct net *net,
2313 struct in6_rtmsg *rtmsg,
2314 struct fib6_config *cfg)
2316 memset(cfg, 0, sizeof(*cfg));
2318 cfg->fc_table = RT6_TABLE_MAIN;
2319 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2320 cfg->fc_metric = rtmsg->rtmsg_metric;
2321 cfg->fc_expires = rtmsg->rtmsg_info;
2322 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2323 cfg->fc_src_len = rtmsg->rtmsg_src_len;
2324 cfg->fc_flags = rtmsg->rtmsg_flags;
2326 cfg->fc_nlinfo.nl_net = net;
2328 cfg->fc_dst = rtmsg->rtmsg_dst;
2329 cfg->fc_src = rtmsg->rtmsg_src;
2330 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/*
 * Legacy SIOCADDRT/SIOCDELRT ioctl entry point: requires CAP_NET_ADMIN
 * in the namespace, copies the in6_rtmsg from userspace, converts it to
 * a fib6_config and adds or deletes the route.
 * NOTE(review): the rtnl locking and default/return lines are elided in
 * this excerpt.
 */
2333 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2335 struct fib6_config cfg;
2336 struct in6_rtmsg rtmsg;
2340 case SIOCADDRT: /* Add a route */
2341 case SIOCDELRT: /* Delete a route */
2342 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2344 err = copy_from_user(&rtmsg, arg,
2345 sizeof(struct in6_rtmsg));
2349 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2354 err = ip6_route_add(&cfg);
2357 err = ip6_route_del(&cfg);
2371 * Drop the packet on the floor
/*
 * Common discard handler for reject routes: bump the appropriate
 * per-netns SNMP counter (INADDRERRORS for unroutable unspecified
 * destinations on input, otherwise the supplied no-route counter) and
 * send an ICMPv6 Destination Unreachable with the given @code.
 */
2374 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2377 struct dst_entry *dst = skb_dst(skb);
2378 switch (ipstats_mib_noroutes) {
2379 case IPSTATS_MIB_INNOROUTES:
2380 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2381 if (type == IPV6_ADDR_ANY) {
2382 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2383 IPSTATS_MIB_INADDRERRORS);
/* fallthrough for non-ANY destinations */
2387 case IPSTATS_MIB_OUTNOROUTES:
2388 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2389 ipstats_mib_noroutes);
2392 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst.input handler for blackhole/unreachable routes (inbound). */
2397 static int ip6_pkt_discard(struct sk_buff *skb)
2399 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst.output handler for blackhole/unreachable routes (outbound). */
2402 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
2404 skb->dev = skb_dst(skb)->dev;
2405 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* dst.input handler for prohibit routes: administratively prohibited. */
2408 static int ip6_pkt_prohibit(struct sk_buff *skb)
2410 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* dst.output handler for prohibit routes: administratively prohibited. */
2413 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
2415 skb->dev = skb_dst(skb)->dev;
2416 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2420 * Allocate a dst for local (unicast / anycast) address.
/*
 * Allocate the host route used for a local unicast or anycast address:
 * a /128 entry on the loopback device with local delivery as input,
 * destined for the per-netns RT6_TABLE_LOCAL table.  Returns an
 * ERR_PTR on allocation failure.
 */
2423 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2424 const struct in6_addr *addr,
2427 struct net *net = dev_net(idev->dev);
2428 struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2431 return ERR_PTR(-ENOMEM);
2435 rt->dst.flags |= DST_HOST;
2436 rt->dst.input = ip6_input;
2437 rt->dst.output = ip6_output;
2438 rt->rt6i_idev = idev;
2440 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2442 rt->rt6i_flags |= RTF_ANYCAST;
2444 rt->rt6i_flags |= RTF_LOCAL;
/* local routes have themselves as gateway (no next hop) */
2446 rt->rt6i_gateway = *addr;
2447 rt->rt6i_dst.addr = *addr;
2448 rt->rt6i_dst.plen = 128;
2449 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2451 atomic_set(&rt->dst.__refcnt, 1);
/*
 * Select a source address for @daddr: use the route's preferred source
 * (rt6i_prefsrc) when set, otherwise fall back to normal source
 * address selection on the route's device.
 */
2456 int ip6_route_get_saddr(struct net *net,
2457 struct rt6_info *rt,
2458 const struct in6_addr *daddr,
2460 struct in6_addr *saddr)
2462 struct inet6_dev *idev =
2463 rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2465 if (rt && rt->rt6i_prefsrc.plen)
2466 *saddr = rt->rt6i_prefsrc.addr;
2468 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2469 daddr, prefs, saddr);
2473 /* remove deleted ip from prefsrc entries */
2474 struct arg_dev_net_ip {
2475 struct net_device *dev;
2477 struct in6_addr *addr;
/*
 * fib6_clean_all() callback: clear the preferred-source setting on any
 * route whose prefsrc matches the address being removed (optionally
 * restricted to one device).  Always returns 0 so the route itself is
 * kept.
 */
2480 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2482 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2483 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2484 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2486 if (((void *)rt->dst.dev == dev || !dev) &&
2487 rt != net->ipv6.ip6_null_entry &&
2488 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2489 /* remove prefsrc entry */
2490 rt->rt6i_prefsrc.plen = 0;
/* Called when address @ifp is deleted: strip it from the prefsrc of
 * every route in the namespace. */
2495 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2497 struct net *net = dev_net(ifp->idev->dev);
2498 struct arg_dev_net_ip adni = {
2499 .dev = ifp->idev->dev,
2503 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2506 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2507 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
2509 /* Remove routers and update dst entries when gateway turn into host. */
/*
 * fib6_clean_all() callback: match RA default-router entries and cached
 * gateway routes whose gateway equals @arg (the node that stopped being
 * a router).  Return value for matches is elided in this excerpt.
 */
2510 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2512 struct in6_addr *gateway = (struct in6_addr *)arg;
2514 if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2515 ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2516 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
/* Purge routes that used @gateway as a router once it became a host. */
2522 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2524 fib6_clean_all(net, fib6_clean_tohost, gateway);
/* Walker argument for fib6_ifdown(): device going down (NULL = all)
 * and its namespace (net field elided in this excerpt). */
2527 struct arg_dev_net {
2528 struct net_device *dev;
/*
 * fib6_clean_all()/icmp6_clean_all() callback: select every route on
 * the departing device (except the null entry) for removal.  The
 * nonzero "delete" return is elided in this excerpt.
 */
2532 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2534 const struct arg_dev_net *adn = arg;
2535 const struct net_device *dev = adn->dev;
2537 if ((rt->dst.dev == dev || !dev) &&
2538 rt != adn->net->ipv6.ip6_null_entry)
/*
 * Device-down handler: drop FIB routes, ephemeral ICMPv6 routes and
 * uncached-list entries that reference @dev.
 */
2544 void rt6_ifdown(struct net *net, struct net_device *dev)
2546 struct arg_dev_net adn = {
2551 fib6_clean_all(net, fib6_ifdown, &adn);
2552 icmp6_clean_all(fib6_ifdown, &adn);
2553 rt6_uncached_list_flush_dev(net, dev);
/* Walker argument for rt6_mtu_change_route(): the device whose MTU
 * changed (mtu field elided in this excerpt). */
2556 struct rt6_mtu_change_arg {
2557 struct net_device *dev;
/*
 * fib6_clean_all() callback applying an administrative device-MTU
 * change to routes on that device (unless RTAX_MTU is locked).  Cached
 * routes only shrink their learned PMTU; other routes update the
 * RTAX_MTU metric both for decreases and for increases when the old
 * route MTU was the device MTU (see comment below, per RFC 1981).
 */
2561 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2563 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2564 struct inet6_dev *idev;
2566 /* In IPv6 pmtu discovery is not optional,
2567 so that RTAX_MTU lock cannot disable it.
2568 We still use this lock to block changes
2569 caused by addrconf/ndisc.
2572 idev = __in6_dev_get(arg->dev);
2576 /* For administrative MTU increase, there is no way to discover
2577 IPv6 PMTU increase, so PMTU increase should be updated here.
2578 Since RFC 1981 doesn't include administrative MTU increase
2579 update PMTU increase is a MUST. (i.e. jumbo frame)
2582 If new MTU is less than route PMTU, this new MTU will be the
2583 lowest MTU in the path, update the route PMTU to reflect PMTU
2584 decreases; if new MTU is greater than route PMTU, and the
2585 old MTU is the lowest MTU in the path, update the route PMTU
2586 to reflect the increase. In this case if the other nodes' MTU
2587 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2590 if (rt->dst.dev == arg->dev &&
2591 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2592 if (rt->rt6i_flags & RTF_CACHE) {
2593 /* For RTF_CACHE with rt6i_pmtu == 0
2594 * (i.e. a redirected route),
2595 * the metrics of its rt->dst.from has already
2598 if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2599 rt->rt6i_pmtu = arg->mtu;
2600 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2601 (dst_mtu(&rt->dst) < arg->mtu &&
2602 dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2603 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/* Propagate a device MTU change to every route in its namespace. */
2609 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2611 struct rt6_mtu_change_arg arg = {
2616 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* Netlink attribute validation policy for RTM_NEWROUTE/RTM_DELROUTE
 * messages handled by rtm_to_fib6_config(). */
2619 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2620 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2621 [RTA_OIF] = { .type = NLA_U32 },
2622 [RTA_IIF] = { .type = NLA_U32 },
2623 [RTA_PRIORITY] = { .type = NLA_U32 },
2624 [RTA_METRICS] = { .type = NLA_NESTED },
2625 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
2626 [RTA_PREF] = { .type = NLA_U8 },
2627 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
2628 [RTA_ENCAP] = { .type = NLA_NESTED },
/*
 * Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * fib6_config: validate attributes against rtm_ipv6_policy, map the
 * rtmsg header fields to flags (reject types, RTN_LOCAL, RTM_F_CLONED)
 * and copy each present RTA_* attribute.  Returns 0 or a negative
 * errno from nlmsg_parse() (error paths elided in this excerpt).
 */
2631 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2632 struct fib6_config *cfg)
2635 struct nlattr *tb[RTA_MAX+1];
2639 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2644 rtm = nlmsg_data(nlh);
2645 memset(cfg, 0, sizeof(*cfg));
2647 cfg->fc_table = rtm->rtm_table;
2648 cfg->fc_dst_len = rtm->rtm_dst_len;
2649 cfg->fc_src_len = rtm->rtm_src_len;
2650 cfg->fc_flags = RTF_UP;
2651 cfg->fc_protocol = rtm->rtm_protocol;
2652 cfg->fc_type = rtm->rtm_type;
/* all reject route types map to RTF_REJECT; the subtype is kept in
 * fc_type for ip6_route_add()'s switch */
2654 if (rtm->rtm_type == RTN_UNREACHABLE ||
2655 rtm->rtm_type == RTN_BLACKHOLE ||
2656 rtm->rtm_type == RTN_PROHIBIT ||
2657 rtm->rtm_type == RTN_THROW)
2658 cfg->fc_flags |= RTF_REJECT;
2660 if (rtm->rtm_type == RTN_LOCAL)
2661 cfg->fc_flags |= RTF_LOCAL;
2663 if (rtm->rtm_flags & RTM_F_CLONED)
2664 cfg->fc_flags |= RTF_CACHE;
2666 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2667 cfg->fc_nlinfo.nlh = nlh;
2668 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2670 if (tb[RTA_GATEWAY]) {
2671 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2672 cfg->fc_flags |= RTF_GATEWAY;
/* prefixes may be sent truncated to the byte length of the mask */
2676 int plen = (rtm->rtm_dst_len + 7) >> 3;
2678 if (nla_len(tb[RTA_DST]) < plen)
2681 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2685 int plen = (rtm->rtm_src_len + 7) >> 3;
2687 if (nla_len(tb[RTA_SRC]) < plen)
2690 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2693 if (tb[RTA_PREFSRC])
2694 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2697 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2699 if (tb[RTA_PRIORITY])
2700 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2702 if (tb[RTA_METRICS]) {
2703 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2704 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2708 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2710 if (tb[RTA_MULTIPATH]) {
2711 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2712 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
/* unknown preference values degrade to medium per RFC 4191 */
2716 pref = nla_get_u8(tb[RTA_PREF]);
2717 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2718 pref != ICMPV6_ROUTER_PREF_HIGH)
2719 pref = ICMPV6_ROUTER_PREF_MEDIUM;
2720 cfg->fc_flags |= RTF_PREF(pref);
2724 cfg->fc_encap = tb[RTA_ENCAP];
2726 if (tb[RTA_ENCAP_TYPE])
2727 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
/*
 * Add (@add != 0) or delete each nexthop of an RTA_MULTIPATH route.
 * Each nexthop is applied as an independent single-path route built
 * from a copy of @cfg overridden with the rtnexthop's ifindex and any
 * per-hop RTA_GATEWAY/RTA_ENCAP attributes.  On delete, failures are
 * tolerated so all hops are attempted; on add, a failure triggers
 * rollback of already-added hops (rollback lines elided in this
 * excerpt).
 */
2734 static int ip6_route_multipath(struct fib6_config *cfg, int add)
2736 struct fib6_config r_cfg;
2737 struct rtnexthop *rtnh;
2740 int err = 0, last_err = 0;
2742 remaining = cfg->fc_mp_len;
2744 rtnh = (struct rtnexthop *)cfg->fc_mp;
2746 /* Parse a Multipath Entry */
2747 while (rtnh_ok(rtnh, remaining)) {
2748 memcpy(&r_cfg, cfg, sizeof(*cfg));
2749 if (rtnh->rtnh_ifindex)
2750 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2752 attrlen = rtnh_attrlen(rtnh);
2754 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2756 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2758 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2759 r_cfg.fc_flags |= RTF_GATEWAY;
2761 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
2762 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
2764 r_cfg.fc_encap_type = nla_get_u16(nla);
2766 err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2769 /* If we are trying to remove a route, do not stop the
2770 * loop when ip6_route_del() fails (because next hop is
2771 * already gone), we should try to remove all next hops.
2774 /* If add fails, we should try to delete all
2775 * next hops that have been already added.
/* compute how many bytes of nexthops were consumed so the rollback
 * pass can stop at the failed entry */
2778 remaining = cfg->fc_mp_len - remaining;
2782 /* Because each route is added like a single route we remove
2783 * these flags after the first nexthop: if there is a collision,
2784 * we have already failed to add the first nexthop:
2785 * fib6_add_rt2node() has rejected it; when replacing, old
2786 * nexthops have been replaced by first new, the rest should
2789 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2791 rtnh = rtnh_next(rtnh, &remaining);
/* RTM_DELROUTE handler: parse the message and delete either a
 * multipath set or a single route. */
2797 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2799 struct fib6_config cfg;
2802 err = rtm_to_fib6_config(skb, nlh, &cfg);
2807 return ip6_route_multipath(&cfg, 0);
2809 return ip6_route_del(&cfg);
/* RTM_NEWROUTE handler: parse the message and add either a multipath
 * set or a single route. */
2812 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2814 struct fib6_config cfg;
2817 err = rtm_to_fib6_config(skb, nlh, &cfg);
2822 return ip6_route_multipath(&cfg, 1);
2824 return ip6_route_add(&cfg);
/*
 * Worst-case netlink message size for one route dump entry — the sum
 * of the rtmsg header plus every attribute rt6_fill_node() may emit,
 * including the route's lwtunnel encapsulation.  Used to size
 * notification skbs.
 */
2827 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
2829 return NLMSG_ALIGN(sizeof(struct rtmsg))
2830 + nla_total_size(16) /* RTA_SRC */
2831 + nla_total_size(16) /* RTA_DST */
2832 + nla_total_size(16) /* RTA_GATEWAY */
2833 + nla_total_size(16) /* RTA_PREFSRC */
2834 + nla_total_size(4) /* RTA_TABLE */
2835 + nla_total_size(4) /* RTA_IIF */
2836 + nla_total_size(4) /* RTA_OIF */
2837 + nla_total_size(4) /* RTA_PRIORITY */
2838 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2839 + nla_total_size(sizeof(struct rta_cacheinfo))
2840 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
2841 + nla_total_size(1) /* RTA_PREF */
2842 + lwtunnel_get_encap_size(rt->dst.lwtstate);
/*
 * Serialise one rt6_info into an RTM_NEWROUTE netlink message on @skb:
 * rtmsg header (type, scope, protocol, flags) followed by the RTA_*
 * attributes (addresses, table, metrics, cacheinfo, preference,
 * lwtunnel encap).  @prefix restricts the dump to RTF_PREFIX_RT
 * routes; @dst/@src/@iif are set for RTM_GETROUTE replies.  Returns 0
 * on success or cancels the message on overflow.
 * NOTE(review): several lines (fl6/flowi setup, some branches) are
 * elided in this excerpt.
 */
2845 static int rt6_fill_node(struct net *net,
2846 struct sk_buff *skb, struct rt6_info *rt,
2847 struct in6_addr *dst, struct in6_addr *src,
2848 int iif, int type, u32 portid, u32 seq,
2849 int prefix, int nowait, unsigned int flags)
2851 u32 metrics[RTAX_MAX];
2853 struct nlmsghdr *nlh;
2857 if (prefix) { /* user wants prefix routes only */
2858 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2859 /* success since this is not a prefix route */
2864 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2868 rtm = nlmsg_data(nlh);
2869 rtm->rtm_family = AF_INET6;
2870 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2871 rtm->rtm_src_len = rt->rt6i_src.plen;
2874 table = rt->rt6i_table->tb6_id;
2876 table = RT6_TABLE_UNSPEC;
2877 rtm->rtm_table = table;
2878 if (nla_put_u32(skb, RTA_TABLE, table))
2879 goto nla_put_failure;
/* reject routes: recover the RTN_* subtype from dst.error */
2880 if (rt->rt6i_flags & RTF_REJECT) {
2881 switch (rt->dst.error) {
2883 rtm->rtm_type = RTN_BLACKHOLE;
2886 rtm->rtm_type = RTN_PROHIBIT;
2889 rtm->rtm_type = RTN_THROW;
2892 rtm->rtm_type = RTN_UNREACHABLE;
2896 else if (rt->rt6i_flags & RTF_LOCAL)
2897 rtm->rtm_type = RTN_LOCAL;
2898 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2899 rtm->rtm_type = RTN_LOCAL;
2901 rtm->rtm_type = RTN_UNICAST;
2903 if (!netif_carrier_ok(rt->dst.dev)) {
2904 rtm->rtm_flags |= RTNH_F_LINKDOWN;
2905 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
2906 rtm->rtm_flags |= RTNH_F_DEAD;
2908 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2909 rtm->rtm_protocol = rt->rt6i_protocol;
2910 if (rt->rt6i_flags & RTF_DYNAMIC)
2911 rtm->rtm_protocol = RTPROT_REDIRECT;
2912 else if (rt->rt6i_flags & RTF_ADDRCONF) {
2913 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2914 rtm->rtm_protocol = RTPROT_RA;
2916 rtm->rtm_protocol = RTPROT_KERNEL;
2919 if (rt->rt6i_flags & RTF_CACHE)
2920 rtm->rtm_flags |= RTM_F_CLONED;
/* explicit @dst (RTM_GETROUTE reply) is reported as a /128 */
2923 if (nla_put_in6_addr(skb, RTA_DST, dst))
2924 goto nla_put_failure;
2925 rtm->rtm_dst_len = 128;
2926 } else if (rtm->rtm_dst_len)
2927 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
2928 goto nla_put_failure;
2929 #ifdef CONFIG_IPV6_SUBTREES
2931 if (nla_put_in6_addr(skb, RTA_SRC, src))
2932 goto nla_put_failure;
2933 rtm->rtm_src_len = 128;
2934 } else if (rtm->rtm_src_len &&
2935 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
2936 goto nla_put_failure;
2939 #ifdef CONFIG_IPV6_MROUTE
/* multicast destinations are resolved through the mroute tables */
2940 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2941 int err = ip6mr_get_route(net, skb, rtm, nowait);
2946 goto nla_put_failure;
2948 if (err == -EMSGSIZE)
2949 goto nla_put_failure;
2954 if (nla_put_u32(skb, RTA_IIF, iif))
2955 goto nla_put_failure;
2957 struct in6_addr saddr_buf;
2958 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2959 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2960 goto nla_put_failure;
2963 if (rt->rt6i_prefsrc.plen) {
2964 struct in6_addr saddr_buf;
2965 saddr_buf = rt->rt6i_prefsrc.addr;
2966 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2967 goto nla_put_failure;
/* report the learned PMTU (rt6i_pmtu) in place of the metric */
2970 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2972 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
2973 if (rtnetlink_put_metrics(skb, metrics) < 0)
2974 goto nla_put_failure;
2976 if (rt->rt6i_flags & RTF_GATEWAY) {
2977 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
2978 goto nla_put_failure;
2982 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2983 goto nla_put_failure;
2984 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2985 goto nla_put_failure;
2987 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2989 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2990 goto nla_put_failure;
2992 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
2993 goto nla_put_failure;
2995 lwtunnel_fill_encap(skb, rt->dst.lwtstate);
2997 nlmsg_end(skb, nlh);
3001 nlmsg_cancel(skb, nlh);
/*
 * rt6_dump_route - per-route callback used while walking the fib6 tree
 * for an RTM_GETROUTE dump.  If the request's rtmsg carried RTM_F_PREFIX,
 * only prefix routes are reported; the actual netlink record is emitted
 * by rt6_fill_node() with NLM_F_MULTI set for the multipart dump.
 * NOTE(review): listing is elided here -- the declaration of 'prefix'
 * and the branch clearing it when no rtmsg is attached are not visible.
 */
3005 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3007 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
/* Only trust the rtmsg payload if the request actually carried one. */
3010 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3011 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3012 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
/* Emit one RTM_NEWROUTE record into the dump skb. */
3016 return rt6_fill_node(arg->net,
3017 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3018 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3019 prefix, 0, NLM_F_MULTI);
/*
 * inet6_rtm_getroute - handle an RTM_GETROUTE request for a single route.
 * Parses the netlink attributes into a flow key, performs either an
 * input-path lookup (when RTA_IIF was supplied) or an output lookup,
 * fills one RTM_NEWROUTE record and unicasts it back to the requester.
 * NOTE(review): listing is elided -- error paths, the fl6/rtm declarations
 * and several closing braces are not visible in this extract.
 */
3022 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3024 struct net *net = sock_net(in_skb->sk);
3025 struct nlattr *tb[RTA_MAX+1];
3026 struct rt6_info *rt;
3027 struct sk_buff *skb;
3030 int err, iif = 0, oif = 0;
3032 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3037 memset(&fl6, 0, sizeof(fl6));
/* Validate attribute lengths before copying addresses out of them. */
3040 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3043 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3047 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3050 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3054 iif = nla_get_u32(tb[RTA_IIF]);
3057 oif = nla_get_u32(tb[RTA_OIF]);
3060 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
/* iif given: emulate packet reception on that device. */
3063 struct net_device *dev;
3066 dev = __dev_get_by_index(net, iif);
3072 fl6.flowi6_iif = iif;
3074 if (!ipv6_addr_any(&fl6.saddr))
3075 flags |= RT6_LOOKUP_F_HAS_SADDR;
3077 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
/* No iif: plain output route lookup. */
3080 fl6.flowi6_oif = oif;
3082 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3085 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3092 /* Reserve room for dummy headers, this skb can pass
3093 through good chunk of routing engine.
3095 skb_reset_mac_header(skb);
3096 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
/* Attach the looked-up dst so rt6_fill_node() can report it. */
3098 skb_dst_set(skb, &rt->dst);
3100 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3101 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3102 nlh->nlmsg_seq, 0, 0, 0);
3108 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/*
 * inet6_rt_notify - broadcast a route change (add/delete) to listeners
 * on the RTNLGRP_IPV6_ROUTE netlink multicast group.  On any failure the
 * error is recorded on the group via rtnl_set_sk_err().
 * NOTE(review): listing is elided -- the err/seq declarations, the skb
 * allocation failure path and the errout label are not visible here.
 */
3113 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
3115 struct sk_buff *skb;
3116 struct net *net = info->nl_net;
3121 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
/* rt6_nlmsg_size() must be a tight upper bound on the message size. */
3123 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3127 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3128 event, info->portid, seq, 0, 0, 0);
3130 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3131 WARN_ON(err == -EMSGSIZE);
3135 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3136 info->nlh, gfp_any());
3140 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * ip6_route_dev_notify - netdevice notifier callback.  When the loopback
 * device of a namespace registers, point the namespace's special dst
 * entries (null, and with multiple tables also prohibit/blackhole) at it
 * and take an inet6_dev reference for each.
 * NOTE(review): listing is elided -- the #endif, the NETDEV_UNREGISTER
 * handling (if any) and the return statement are not visible here.
 */
3143 static int ip6_route_dev_notify(struct notifier_block *this,
3144 unsigned long event, void *ptr)
3146 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3147 struct net *net = dev_net(dev);
3149 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3150 net->ipv6.ip6_null_entry->dst.dev = dev;
3151 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3152 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3153 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3154 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3155 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3156 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3167 #ifdef CONFIG_PROC_FS
/*
 * File operations backing /proc/net/ipv6_route.
 * NOTE(review): listing is elided -- the .read member and the closing
 * "};" are not visible in this extract.
 */
3169 static const struct file_operations ipv6_route_proc_fops = {
3170 .owner = THIS_MODULE,
3171 .open = ipv6_route_open,
3173 .llseek = seq_lseek,
3174 .release = seq_release_net,
/*
 * rt6_stats_seq_show - print the namespace's fib6 statistics as one line
 * of seven hex fields (/proc/net/rt6_stats).  The sixth field is the
 * current dst entry count rather than a stored counter.
 */
3177 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3179 struct net *net = (struct net *)seq->private;
3180 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3181 net->ipv6.rt6_stats->fib_nodes,
3182 net->ipv6.rt6_stats->fib_route_nodes,
3183 net->ipv6.rt6_stats->fib_rt_alloc,
3184 net->ipv6.rt6_stats->fib_rt_entries,
3185 net->ipv6.rt6_stats->fib_rt_cache,
3186 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3187 net->ipv6.rt6_stats->fib_discarded_routes);
/* open() handler for /proc/net/rt6_stats: single-shot, netns-aware. */
3192 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3194 return single_open_net(inode, file, rt6_stats_seq_show);
/*
 * File operations backing /proc/net/rt6_stats.
 * NOTE(review): listing is elided -- the .read member and the closing
 * "};" are not visible in this extract.
 */
3197 static const struct file_operations rt6_stats_seq_fops = {
3198 .owner = THIS_MODULE,
3199 .open = rt6_stats_seq_open,
3201 .llseek = seq_lseek,
3202 .release = single_release_net,
3204 #endif /* CONFIG_PROC_FS */
3206 #ifdef CONFIG_SYSCTL
/*
 * ipv6_sysctl_rtcache_flush - handler for net.ipv6.route.flush.
 * Snapshots the configured flush delay, lets proc_dointvec() consume the
 * write, then runs the fib6 garbage collector: immediately when the
 * delay is <= 0, otherwise after 'delay'.
 * NOTE(review): listing is elided -- the write-only guard and return
 * statement are not visible in this extract.
 */
3209 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3210 void __user *buffer, size_t *lenp, loff_t *ppos)
/* extra1 carries the owning netns (set up in ipv6_route_sysctl_init). */
3217 net = (struct net *)ctl->extra1;
3218 delay = net->ipv6.sysctl.flush_delay;
3219 proc_dointvec(ctl, write, buffer, lenp, ppos);
3220 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/*
 * Template sysctl table for net.ipv6.route.*.  The .data pointers here
 * reference init_net / the template ops; ipv6_route_sysctl_init() makes
 * a per-namespace copy and repoints each entry by index, so the entry
 * order below is part of the contract with that function.
 * NOTE(review): listing is elided -- .mode members, the inner "}," "{"
 * separators and the terminating empty entry are not visible here.
 */
3224 struct ctl_table ipv6_route_table_template[] = {
3226 .procname = "flush",
3227 .data = &init_net.ipv6.sysctl.flush_delay,
3228 .maxlen = sizeof(int),
3230 .proc_handler = ipv6_sysctl_rtcache_flush
3233 .procname = "gc_thresh",
3234 .data = &ip6_dst_ops_template.gc_thresh,
3235 .maxlen = sizeof(int),
3237 .proc_handler = proc_dointvec,
3240 .procname = "max_size",
3241 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
3242 .maxlen = sizeof(int),
3244 .proc_handler = proc_dointvec,
3247 .procname = "gc_min_interval",
3248 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3249 .maxlen = sizeof(int),
3251 .proc_handler = proc_dointvec_jiffies,
3254 .procname = "gc_timeout",
3255 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3256 .maxlen = sizeof(int),
3258 .proc_handler = proc_dointvec_jiffies,
3261 .procname = "gc_interval",
3262 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3263 .maxlen = sizeof(int),
3265 .proc_handler = proc_dointvec_jiffies,
3268 .procname = "gc_elasticity",
3269 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3270 .maxlen = sizeof(int),
3272 .proc_handler = proc_dointvec,
3275 .procname = "mtu_expires",
3276 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3277 .maxlen = sizeof(int),
3279 .proc_handler = proc_dointvec_jiffies,
3282 .procname = "min_adv_mss",
3283 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3284 .maxlen = sizeof(int),
3286 .proc_handler = proc_dointvec,
/* Same variable as gc_min_interval, exposed in milliseconds. */
3289 .procname = "gc_min_interval_ms",
3290 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3291 .maxlen = sizeof(int),
3293 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * ipv6_route_sysctl_init - duplicate the template sysctl table for a new
 * namespace and repoint every .data member at that namespace's storage.
 * The numeric indices below must match the entry order of
 * ipv6_route_table_template.  Returns the new table (caller registers
 * and later frees it).
 * NOTE(review): listing is elided -- the GFP flag of kmemdup(), the
 * NULL-check of the allocation and the return are not visible here.
 */
3298 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3300 struct ctl_table *table;
3302 table = kmemdup(ipv6_route_table_template,
3303 sizeof(ipv6_route_table_template),
3307 table[0].data = &net->ipv6.sysctl.flush_delay;
/* extra1 hands the netns to ipv6_sysctl_rtcache_flush(). */
3308 table[0].extra1 = net;
3309 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3310 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3311 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3312 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3313 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3314 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3315 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3316 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3317 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3319 /* Don't export sysctls to unprivileged users */
3320 if (net->user_ns != &init_user_ns)
3321 table[0].procname = NULL;
/*
 * ip6_route_net_init - per-namespace routing setup.  Copies the dst ops
 * template, allocates the special null (and, with multiple tables,
 * prohibit/blackhole) route entries, wires their dst.path/ops/metrics,
 * and seeds the sysctl defaults.  Error paths unwind in reverse
 * allocation order via the labels at the bottom.
 * NOTE(review): listing is elided -- the ret declaration, GFP flags of
 * the kmemdup() calls, the #endif lines, the success return and the
 * out_ip6_null_entry/out_ip6_dst_ops labels are not visible here.
 */
3328 static int __net_init ip6_route_net_init(struct net *net)
3332 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3333 sizeof(net->ipv6.ip6_dst_ops));
3335 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3336 goto out_ip6_dst_ops;
/* Always-present sentinel route returned on lookup failure. */
3338 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3339 sizeof(*net->ipv6.ip6_null_entry),
3341 if (!net->ipv6.ip6_null_entry)
3342 goto out_ip6_dst_entries;
3343 net->ipv6.ip6_null_entry->dst.path =
3344 (struct dst_entry *)net->ipv6.ip6_null_entry;
3345 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3346 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3347 ip6_template_metrics, true);
3349 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3350 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3351 sizeof(*net->ipv6.ip6_prohibit_entry),
3353 if (!net->ipv6.ip6_prohibit_entry)
3354 goto out_ip6_null_entry;
3355 net->ipv6.ip6_prohibit_entry->dst.path =
3356 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3357 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3358 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3359 ip6_template_metrics, true);
3361 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3362 sizeof(*net->ipv6.ip6_blk_hole_entry),
3364 if (!net->ipv6.ip6_blk_hole_entry)
3365 goto out_ip6_prohibit_entry;
3366 net->ipv6.ip6_blk_hole_entry->dst.path =
3367 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3368 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3369 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3370 ip6_template_metrics, true);
/* Default tunables for this namespace (overridable via sysctl). */
3373 net->ipv6.sysctl.flush_delay = 0;
3374 net->ipv6.sysctl.ip6_rt_max_size = 4096;
3375 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3376 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3377 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3378 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3379 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3380 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3382 net->ipv6.ip6_rt_gc_expire = 30*HZ;
/* Error unwinding, reverse allocation order. */
3388 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3389 out_ip6_prohibit_entry:
3390 kfree(net->ipv6.ip6_prohibit_entry);
3392 kfree(net->ipv6.ip6_null_entry);
3394 out_ip6_dst_entries:
3395 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * ip6_route_net_exit - tear down what ip6_route_net_init() built:
 * free the special route entries and release the dst entry counter.
 */
3400 static void __net_exit ip6_route_net_exit(struct net *net)
3402 kfree(net->ipv6.ip6_null_entry);
3403 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3404 kfree(net->ipv6.ip6_prohibit_entry);
3405 kfree(net->ipv6.ip6_blk_hole_entry);
3407 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * Late per-namespace init: create the /proc/net/ipv6_route and
 * /proc/net/rt6_stats entries (when procfs is configured).
 * NOTE(review): return values of proc_create() are not checked in the
 * visible lines.
 */
3410 static int __net_init ip6_route_net_init_late(struct net *net)
3412 #ifdef CONFIG_PROC_FS
3413 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3414 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
/* Late per-namespace teardown: remove the two proc entries. */
3419 static void __net_exit ip6_route_net_exit_late(struct net *net)
3421 #ifdef CONFIG_PROC_FS
3422 remove_proc_entry("ipv6_route", net->proc_net);
3423 remove_proc_entry("rt6_stats", net->proc_net);
/* Main per-namespace lifecycle hooks for the IPv6 routing core. */
3427 static struct pernet_operations ip6_route_net_ops = {
3428 .init = ip6_route_net_init,
3429 .exit = ip6_route_net_exit,
/*
 * Allocate and install the per-namespace IPv6 inetpeer base.
 * NOTE(review): the NULL-check of the allocation and the return value
 * are not visible in this extract.
 */
3432 static int __net_init ipv6_inetpeer_init(struct net *net)
3434 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3438 inet_peer_base_init(bp);
3439 net->ipv6.peers = bp;
/*
 * Detach the namespace's inetpeer base, invalidate its tree, then free
 * it.  NOTE(review): the kfree(bp) that presumably follows is not
 * visible in this extract.
 */
3443 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3445 struct inet_peer_base *bp = net->ipv6.peers;
3447 net->ipv6.peers = NULL;
3448 inetpeer_invalidate_tree(bp);
/* Per-namespace lifecycle hooks for the IPv6 inetpeer storage. */
3452 static struct pernet_operations ipv6_inetpeer_ops = {
3453 .init = ipv6_inetpeer_init,
3454 .exit = ipv6_inetpeer_exit,
/* Late-stage per-namespace hooks (procfs entries). */
3457 static struct pernet_operations ip6_route_net_late_ops = {
3458 .init = ip6_route_net_init_late,
3459 .exit = ip6_route_net_exit_late,
/* Netdevice event subscription driving ip6_route_dev_notify(). */
3462 static struct notifier_block ip6_route_dev_notifier = {
3463 .notifier_call = ip6_route_dev_notify,
/*
 * ip6_route_init - module/boot-time initialization of the IPv6 routing
 * subsystem.  In order: create the rt6_info slab cache, init the
 * blackhole dst counter, register the inetpeer and routing pernet
 * subsystems, manually attach init_net's special routes to the already
 * registered loopback device, bring up fib6 (rules), register the late
 * pernet subsystem, the three RTM_* rtnetlink handlers and the netdev
 * notifier, and finally init the per-CPU uncached route lists.
 * Failures unwind through the labels at the bottom in reverse order.
 * NOTE(review): listing is elided -- the ret/cpu declarations, the
 * fib6_init() call preceding "goto out_register_subsys", the success
 * return and several intermediate labels are not visible here.
 */
3467 int __init ip6_route_init(void)
3473 ip6_dst_ops_template.kmem_cachep =
3474 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3475 SLAB_HWCACHE_ALIGN, NULL);
3476 if (!ip6_dst_ops_template.kmem_cachep)
3479 ret = dst_entries_init(&ip6_dst_blackhole_ops);
3481 goto out_kmem_cache;
3483 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3485 goto out_dst_entries;
3487 ret = register_pernet_subsys(&ip6_route_net_ops);
3489 goto out_register_inetpeer;
3491 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3493 /* Registering of the loopback is done before this portion of code,
3494 * the loopback reference in rt6_info will not be taken, do it
3495 * manually for init_net */
3496 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3497 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3498 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3499 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3500 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3501 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3502 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3506 goto out_register_subsys;
3512 ret = fib6_rules_init();
3516 ret = register_pernet_subsys(&ip6_route_net_late_ops);
3518 goto fib6_rules_init;
/* Netlink message handlers for route add/delete/query. */
3521 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3522 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3523 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3524 goto out_register_late_subsys;
3526 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3528 goto out_register_late_subsys;
/* Per-CPU lists holding routes not attached to the fib tree. */
3530 for_each_possible_cpu(cpu) {
3531 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3533 INIT_LIST_HEAD(&ul->head);
3534 spin_lock_init(&ul->lock);
/* Error unwinding, reverse registration order. */
3540 out_register_late_subsys:
3541 unregister_pernet_subsys(&ip6_route_net_late_ops);
3543 fib6_rules_cleanup();
3548 out_register_subsys:
3549 unregister_pernet_subsys(&ip6_route_net_ops);
3550 out_register_inetpeer:
3551 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3553 dst_entries_destroy(&ip6_dst_blackhole_ops);
3555 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3559 void ip6_route_cleanup(void)
3561 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3562 unregister_pernet_subsys(&ip6_route_net_late_ops);
3563 fib6_rules_cleanup();
3566 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3567 unregister_pernet_subsys(&ip6_route_net_ops);
3568 dst_entries_destroy(&ip6_dst_blackhole_ops);
3569 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);