2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
61 #include <asm/uaccess.h>
64 #include <linux/sysctl.h>
67 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
68 const struct in6_addr *dest);
69 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
70 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
71 static unsigned int ip6_mtu(const struct dst_entry *dst);
72 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
73 static void ip6_dst_destroy(struct dst_entry *);
74 static void ip6_dst_ifdown(struct dst_entry *,
75 struct net_device *dev, int how);
76 static int ip6_dst_gc(struct dst_ops *ops);
78 static int ip6_pkt_discard(struct sk_buff *skb);
79 static int ip6_pkt_discard_out(struct sk_buff *skb);
80 static void ip6_link_failure(struct sk_buff *skb);
81 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
83 #ifdef CONFIG_IPV6_ROUTE_INFO
84 static struct rt6_info *rt6_add_route_info(struct net *net,
85 const struct in6_addr *prefix, int prefixlen,
86 const struct in6_addr *gwaddr, int ifindex,
88 static struct rt6_info *rt6_get_route_info(struct net *net,
89 const struct in6_addr *prefix, int prefixlen,
90 const struct in6_addr *gwaddr, int ifindex);
93 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
95 struct rt6_info *rt = (struct rt6_info *) dst;
96 struct inet_peer *peer;
99 if (!(rt->dst.flags & DST_HOST))
102 peer = rt6_get_peer_create(rt);
104 u32 *old_p = __DST_METRICS_PTR(old);
105 unsigned long prev, new;
108 if (inet_metrics_new(peer))
109 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
111 new = (unsigned long) p;
112 prev = cmpxchg(&dst->_metrics, old, new);
115 p = __DST_METRICS_PTR(prev);
116 if (prev & DST_METRICS_READ_ONLY)
123 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
125 struct in6_addr *p = &rt->rt6i_gateway;
127 if (!ipv6_addr_any(p))
128 return (const void *) p;
132 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
134 struct rt6_info *rt = (struct rt6_info *) dst;
137 daddr = choose_neigh_daddr(rt, daddr);
138 n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
141 return neigh_create(&nd_tbl, daddr, dst->dev);
144 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
146 struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
148 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
152 dst_set_neighbour(&rt->dst, n);
157 static struct dst_ops ip6_dst_ops_template = {
159 .protocol = cpu_to_be16(ETH_P_IPV6),
162 .check = ip6_dst_check,
163 .default_advmss = ip6_default_advmss,
165 .cow_metrics = ipv6_cow_metrics,
166 .destroy = ip6_dst_destroy,
167 .ifdown = ip6_dst_ifdown,
168 .negative_advice = ip6_negative_advice,
169 .link_failure = ip6_link_failure,
170 .update_pmtu = ip6_rt_update_pmtu,
171 .local_out = __ip6_local_out,
172 .neigh_lookup = ip6_neigh_lookup,
175 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
177 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
179 return mtu ? : dst->dev->mtu;
182 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
186 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
192 static struct dst_ops ip6_dst_blackhole_ops = {
194 .protocol = cpu_to_be16(ETH_P_IPV6),
195 .destroy = ip6_dst_destroy,
196 .check = ip6_dst_check,
197 .mtu = ip6_blackhole_mtu,
198 .default_advmss = ip6_default_advmss,
199 .update_pmtu = ip6_rt_blackhole_update_pmtu,
200 .cow_metrics = ip6_rt_blackhole_cow_metrics,
201 .neigh_lookup = ip6_neigh_lookup,
204 static const u32 ip6_template_metrics[RTAX_MAX] = {
205 [RTAX_HOPLIMIT - 1] = 255,
208 static struct rt6_info ip6_null_entry_template = {
210 .__refcnt = ATOMIC_INIT(1),
213 .error = -ENETUNREACH,
214 .input = ip6_pkt_discard,
215 .output = ip6_pkt_discard_out,
217 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
218 .rt6i_protocol = RTPROT_KERNEL,
219 .rt6i_metric = ~(u32) 0,
220 .rt6i_ref = ATOMIC_INIT(1),
223 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
225 static int ip6_pkt_prohibit(struct sk_buff *skb);
226 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
228 static struct rt6_info ip6_prohibit_entry_template = {
230 .__refcnt = ATOMIC_INIT(1),
234 .input = ip6_pkt_prohibit,
235 .output = ip6_pkt_prohibit_out,
237 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
238 .rt6i_protocol = RTPROT_KERNEL,
239 .rt6i_metric = ~(u32) 0,
240 .rt6i_ref = ATOMIC_INIT(1),
243 static struct rt6_info ip6_blk_hole_entry_template = {
245 .__refcnt = ATOMIC_INIT(1),
249 .input = dst_discard,
250 .output = dst_discard,
252 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
253 .rt6i_protocol = RTPROT_KERNEL,
254 .rt6i_metric = ~(u32) 0,
255 .rt6i_ref = ATOMIC_INIT(1),
260 /* allocate dst with ip6_dst_ops */
261 static inline struct rt6_info *ip6_dst_alloc(struct net *net,
262 struct net_device *dev,
264 struct fib6_table *table)
266 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
270 memset(&rt->rt6i_table, 0,
271 sizeof(*rt) - sizeof(struct dst_entry));
272 rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
277 static void ip6_dst_destroy(struct dst_entry *dst)
279 struct rt6_info *rt = (struct rt6_info *)dst;
280 struct inet6_dev *idev = rt->rt6i_idev;
282 if (!(rt->dst.flags & DST_HOST))
283 dst_destroy_metrics_generic(dst);
286 rt->rt6i_idev = NULL;
290 if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
291 dst_release(dst->from);
293 if (rt6_has_peer(rt)) {
294 struct inet_peer *peer = rt6_peer_ptr(rt);
299 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
301 static u32 rt6_peer_genid(void)
303 return atomic_read(&__rt6_peer_genid);
306 void rt6_bind_peer(struct rt6_info *rt, int create)
308 struct inet_peer_base *base;
309 struct inet_peer *peer;
311 base = inetpeer_base_ptr(rt->_rt6i_peer);
315 peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
317 if (!rt6_set_peer(rt, peer))
320 rt->rt6i_peer_genid = rt6_peer_genid();
324 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
327 struct rt6_info *rt = (struct rt6_info *)dst;
328 struct inet6_dev *idev = rt->rt6i_idev;
329 struct net_device *loopback_dev =
330 dev_net(dev)->loopback_dev;
332 if (dev != loopback_dev && idev && idev->dev == dev) {
333 struct inet6_dev *loopback_idev =
334 in6_dev_get(loopback_dev);
336 rt->rt6i_idev = loopback_idev;
342 static bool rt6_check_expired(const struct rt6_info *rt)
344 struct rt6_info *ort = NULL;
346 if (rt->rt6i_flags & RTF_EXPIRES) {
347 if (time_after(jiffies, rt->dst.expires))
349 } else if (rt->dst.from) {
350 ort = (struct rt6_info *) rt->dst.from;
351 return (ort->rt6i_flags & RTF_EXPIRES) &&
352 time_after(jiffies, ort->dst.expires);
357 static bool rt6_need_strict(const struct in6_addr *daddr)
359 return ipv6_addr_type(daddr) &
360 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
364 * Route lookup. Any table->tb6_lock is implied.
367 static inline struct rt6_info *rt6_device_match(struct net *net,
369 const struct in6_addr *saddr,
373 struct rt6_info *local = NULL;
374 struct rt6_info *sprt;
376 if (!oif && ipv6_addr_any(saddr))
379 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
380 struct net_device *dev = sprt->dst.dev;
383 if (dev->ifindex == oif)
385 if (dev->flags & IFF_LOOPBACK) {
386 if (!sprt->rt6i_idev ||
387 sprt->rt6i_idev->dev->ifindex != oif) {
388 if (flags & RT6_LOOKUP_F_IFACE && oif)
390 if (local && (!oif ||
391 local->rt6i_idev->dev->ifindex == oif))
397 if (ipv6_chk_addr(net, saddr, dev,
398 flags & RT6_LOOKUP_F_IFACE))
407 if (flags & RT6_LOOKUP_F_IFACE)
408 return net->ipv6.ip6_null_entry;
414 #ifdef CONFIG_IPV6_ROUTER_PREF
415 static void rt6_probe(struct rt6_info *rt)
417 struct neighbour *neigh;
419 * Okay, this does not seem to be appropriate
420 * for now, however, we need to check if it
421 * is really so; aka Router Reachability Probing.
423 * Router Reachability Probe MUST be rate-limited
424 * to no more than one per minute.
427 neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
428 if (!neigh || (neigh->nud_state & NUD_VALID))
430 read_lock_bh(&neigh->lock);
431 if (!(neigh->nud_state & NUD_VALID) &&
432 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
433 struct in6_addr mcaddr;
434 struct in6_addr *target;
436 neigh->updated = jiffies;
437 read_unlock_bh(&neigh->lock);
439 target = (struct in6_addr *)&neigh->primary_key;
440 addrconf_addr_solict_mult(target, &mcaddr);
441 ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
443 read_unlock_bh(&neigh->lock);
449 static inline void rt6_probe(struct rt6_info *rt)
455 * Default Router Selection (RFC 2461 6.3.6)
457 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
459 struct net_device *dev = rt->dst.dev;
460 if (!oif || dev->ifindex == oif)
462 if ((dev->flags & IFF_LOOPBACK) &&
463 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
468 static inline int rt6_check_neigh(struct rt6_info *rt)
470 struct neighbour *neigh;
474 neigh = dst_get_neighbour_noref(&rt->dst);
475 if (rt->rt6i_flags & RTF_NONEXTHOP ||
476 !(rt->rt6i_flags & RTF_GATEWAY))
479 read_lock_bh(&neigh->lock);
480 if (neigh->nud_state & NUD_VALID)
482 #ifdef CONFIG_IPV6_ROUTER_PREF
483 else if (neigh->nud_state & NUD_FAILED)
488 read_unlock_bh(&neigh->lock);
495 static int rt6_score_route(struct rt6_info *rt, int oif,
500 m = rt6_check_dev(rt, oif);
501 if (!m && (strict & RT6_LOOKUP_F_IFACE))
503 #ifdef CONFIG_IPV6_ROUTER_PREF
504 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
506 n = rt6_check_neigh(rt);
507 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
512 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
513 int *mpri, struct rt6_info *match)
517 if (rt6_check_expired(rt))
520 m = rt6_score_route(rt, oif, strict);
525 if (strict & RT6_LOOKUP_F_REACHABLE)
529 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
537 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
538 struct rt6_info *rr_head,
539 u32 metric, int oif, int strict)
541 struct rt6_info *rt, *match;
545 for (rt = rr_head; rt && rt->rt6i_metric == metric;
546 rt = rt->dst.rt6_next)
547 match = find_match(rt, oif, strict, &mpri, match);
548 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
549 rt = rt->dst.rt6_next)
550 match = find_match(rt, oif, strict, &mpri, match);
555 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
557 struct rt6_info *match, *rt0;
562 fn->rr_ptr = rt0 = fn->leaf;
564 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
567 (strict & RT6_LOOKUP_F_REACHABLE)) {
568 struct rt6_info *next = rt0->dst.rt6_next;
570 /* no entries matched; do round-robin */
571 if (!next || next->rt6i_metric != rt0->rt6i_metric)
578 net = dev_net(rt0->dst.dev);
579 return match ? match : net->ipv6.ip6_null_entry;
582 #ifdef CONFIG_IPV6_ROUTE_INFO
583 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
584 const struct in6_addr *gwaddr)
586 struct net *net = dev_net(dev);
587 struct route_info *rinfo = (struct route_info *) opt;
588 struct in6_addr prefix_buf, *prefix;
590 unsigned long lifetime;
593 if (len < sizeof(struct route_info)) {
597 /* Sanity check for prefix_len and length */
598 if (rinfo->length > 3) {
600 } else if (rinfo->prefix_len > 128) {
602 } else if (rinfo->prefix_len > 64) {
603 if (rinfo->length < 2) {
606 } else if (rinfo->prefix_len > 0) {
607 if (rinfo->length < 1) {
612 pref = rinfo->route_pref;
613 if (pref == ICMPV6_ROUTER_PREF_INVALID)
616 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
618 if (rinfo->length == 3)
619 prefix = (struct in6_addr *)rinfo->prefix;
621 /* this function is safe */
622 ipv6_addr_prefix(&prefix_buf,
623 (struct in6_addr *)rinfo->prefix,
625 prefix = &prefix_buf;
628 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
631 if (rt && !lifetime) {
637 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
640 rt->rt6i_flags = RTF_ROUTEINFO |
641 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
644 if (!addrconf_finite_timeout(lifetime))
645 rt6_clean_expires(rt);
647 rt6_set_expires(rt, jiffies + HZ * lifetime);
649 dst_release(&rt->dst);
655 #define BACKTRACK(__net, saddr) \
657 if (rt == __net->ipv6.ip6_null_entry) { \
658 struct fib6_node *pn; \
660 if (fn->fn_flags & RTN_TL_ROOT) \
663 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
664 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
667 if (fn->fn_flags & RTN_RTINFO) \
673 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
674 struct fib6_table *table,
675 struct flowi6 *fl6, int flags)
677 struct fib6_node *fn;
680 read_lock_bh(&table->tb6_lock);
681 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
684 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
685 BACKTRACK(net, &fl6->saddr);
687 dst_use(&rt->dst, jiffies);
688 read_unlock_bh(&table->tb6_lock);
693 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
696 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
698 EXPORT_SYMBOL_GPL(ip6_route_lookup);
700 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
701 const struct in6_addr *saddr, int oif, int strict)
703 struct flowi6 fl6 = {
707 struct dst_entry *dst;
708 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
711 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
712 flags |= RT6_LOOKUP_F_HAS_SADDR;
715 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
717 return (struct rt6_info *) dst;
724 EXPORT_SYMBOL(rt6_lookup);
726 /* ip6_ins_rt is called with FREE table->tb6_lock.
727 It takes a new route entry; if the addition fails for any reason, the
728 route is freed. In any case, if caller does not hold it, it may
732 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
735 struct fib6_table *table;
737 table = rt->rt6i_table;
738 write_lock_bh(&table->tb6_lock);
739 err = fib6_add(&table->tb6_root, rt, info);
740 write_unlock_bh(&table->tb6_lock);
745 int ip6_ins_rt(struct rt6_info *rt)
747 struct nl_info info = {
748 .nl_net = dev_net(rt->dst.dev),
750 return __ip6_ins_rt(rt, &info);
753 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
754 const struct in6_addr *daddr,
755 const struct in6_addr *saddr)
763 rt = ip6_rt_copy(ort, daddr);
766 int attempts = !in_softirq();
768 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
769 if (ort->rt6i_dst.plen != 128 &&
770 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
771 rt->rt6i_flags |= RTF_ANYCAST;
772 rt->rt6i_gateway = *daddr;
775 rt->rt6i_flags |= RTF_CACHE;
777 #ifdef CONFIG_IPV6_SUBTREES
778 if (rt->rt6i_src.plen && saddr) {
779 rt->rt6i_src.addr = *saddr;
780 rt->rt6i_src.plen = 128;
785 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
786 struct net *net = dev_net(rt->dst.dev);
787 int saved_rt_min_interval =
788 net->ipv6.sysctl.ip6_rt_gc_min_interval;
789 int saved_rt_elasticity =
790 net->ipv6.sysctl.ip6_rt_gc_elasticity;
792 if (attempts-- > 0) {
793 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
794 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
796 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
798 net->ipv6.sysctl.ip6_rt_gc_elasticity =
800 net->ipv6.sysctl.ip6_rt_gc_min_interval =
801 saved_rt_min_interval;
805 net_warn_ratelimited("Neighbour table overflow\n");
814 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
815 const struct in6_addr *daddr)
817 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
820 rt->rt6i_flags |= RTF_CACHE;
821 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
826 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
827 struct flowi6 *fl6, int flags)
829 struct fib6_node *fn;
830 struct rt6_info *rt, *nrt;
834 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
836 strict |= flags & RT6_LOOKUP_F_IFACE;
839 read_lock_bh(&table->tb6_lock);
842 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
845 rt = rt6_select(fn, oif, strict | reachable);
847 BACKTRACK(net, &fl6->saddr);
848 if (rt == net->ipv6.ip6_null_entry ||
849 rt->rt6i_flags & RTF_CACHE)
853 read_unlock_bh(&table->tb6_lock);
855 if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
856 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
857 else if (!(rt->dst.flags & DST_HOST))
858 nrt = rt6_alloc_clone(rt, &fl6->daddr);
862 dst_release(&rt->dst);
863 rt = nrt ? : net->ipv6.ip6_null_entry;
867 err = ip6_ins_rt(nrt);
876 * Race condition! In the gap, when table->tb6_lock was
877 * released someone could insert this route. Relookup.
879 dst_release(&rt->dst);
888 read_unlock_bh(&table->tb6_lock);
890 rt->dst.lastuse = jiffies;
896 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
897 struct flowi6 *fl6, int flags)
899 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
902 static struct dst_entry *ip6_route_input_lookup(struct net *net,
903 struct net_device *dev,
904 struct flowi6 *fl6, int flags)
906 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
907 flags |= RT6_LOOKUP_F_IFACE;
909 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
912 void ip6_route_input(struct sk_buff *skb)
914 const struct ipv6hdr *iph = ipv6_hdr(skb);
915 struct net *net = dev_net(skb->dev);
916 int flags = RT6_LOOKUP_F_HAS_SADDR;
917 struct flowi6 fl6 = {
918 .flowi6_iif = skb->dev->ifindex,
921 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
922 .flowi6_mark = skb->mark,
923 .flowi6_proto = iph->nexthdr,
926 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
929 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
930 struct flowi6 *fl6, int flags)
932 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
935 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
940 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
941 flags |= RT6_LOOKUP_F_IFACE;
943 if (!ipv6_addr_any(&fl6->saddr))
944 flags |= RT6_LOOKUP_F_HAS_SADDR;
946 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
948 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
951 EXPORT_SYMBOL(ip6_route_output);
953 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
955 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
956 struct dst_entry *new = NULL;
958 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
960 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
961 rt6_init_peer(rt, net->ipv6.peers);
966 new->input = dst_discard;
967 new->output = dst_discard;
969 if (dst_metrics_read_only(&ort->dst))
970 new->_metrics = ort->dst._metrics;
972 dst_copy_metrics(new, &ort->dst);
973 rt->rt6i_idev = ort->rt6i_idev;
975 in6_dev_hold(rt->rt6i_idev);
977 rt->rt6i_gateway = ort->rt6i_gateway;
978 rt->rt6i_flags = ort->rt6i_flags;
979 rt6_clean_expires(rt);
982 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
983 #ifdef CONFIG_IPV6_SUBTREES
984 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
990 dst_release(dst_orig);
991 return new ? new : ERR_PTR(-ENOMEM);
995 * Destination cache support functions
998 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1000 struct rt6_info *rt;
1002 rt = (struct rt6_info *) dst;
1004 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
1005 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1006 if (!rt6_has_peer(rt))
1007 rt6_bind_peer(rt, 0);
1008 rt->rt6i_peer_genid = rt6_peer_genid();
1015 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1017 struct rt6_info *rt = (struct rt6_info *) dst;
1020 if (rt->rt6i_flags & RTF_CACHE) {
1021 if (rt6_check_expired(rt)) {
1033 static void ip6_link_failure(struct sk_buff *skb)
1035 struct rt6_info *rt;
1037 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1039 rt = (struct rt6_info *) skb_dst(skb);
1041 if (rt->rt6i_flags & RTF_CACHE)
1042 rt6_update_expires(rt, 0);
1043 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1044 rt->rt6i_node->fn_sernum = -1;
1048 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1050 struct rt6_info *rt6 = (struct rt6_info*)dst;
1053 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1054 struct net *net = dev_net(dst->dev);
1056 rt6->rt6i_flags |= RTF_MODIFIED;
1057 if (mtu < IPV6_MIN_MTU) {
1058 u32 features = dst_metric(dst, RTAX_FEATURES);
1060 features |= RTAX_FEATURE_ALLFRAG;
1061 dst_metric_set(dst, RTAX_FEATURES, features);
1063 dst_metric_set(dst, RTAX_MTU, mtu);
1064 rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
1068 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1071 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1072 struct dst_entry *dst;
1075 memset(&fl6, 0, sizeof(fl6));
1076 fl6.flowi6_oif = oif;
1077 fl6.flowi6_mark = mark;
1078 fl6.flowi6_flags = FLOWI_FLAG_PRECOW_METRICS;
1079 fl6.daddr = iph->daddr;
1080 fl6.saddr = iph->saddr;
1081 fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
1083 dst = ip6_route_output(net, NULL, &fl6);
1085 ip6_rt_update_pmtu(dst, ntohl(mtu));
1088 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1090 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1092 ip6_update_pmtu(skb, sock_net(sk), mtu,
1093 sk->sk_bound_dev_if, sk->sk_mark);
1095 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1097 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1099 struct net_device *dev = dst->dev;
1100 unsigned int mtu = dst_mtu(dst);
1101 struct net *net = dev_net(dev);
1103 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1105 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1106 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1109 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1110 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1111 * IPV6_MAXPLEN is also valid and means: "any MSS,
1112 * rely only on pmtu discovery"
1114 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1119 static unsigned int ip6_mtu(const struct dst_entry *dst)
1121 struct inet6_dev *idev;
1122 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1130 idev = __in6_dev_get(dst->dev);
1132 mtu = idev->cnf.mtu6;
1138 static struct dst_entry *icmp6_dst_gc_list;
1139 static DEFINE_SPINLOCK(icmp6_dst_lock);
1141 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1142 struct neighbour *neigh,
1145 struct dst_entry *dst;
1146 struct rt6_info *rt;
1147 struct inet6_dev *idev = in6_dev_get(dev);
1148 struct net *net = dev_net(dev);
1150 if (unlikely(!idev))
1151 return ERR_PTR(-ENODEV);
1153 rt = ip6_dst_alloc(net, dev, 0, NULL);
1154 if (unlikely(!rt)) {
1156 dst = ERR_PTR(-ENOMEM);
1163 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1164 if (IS_ERR(neigh)) {
1167 return ERR_CAST(neigh);
1171 rt->dst.flags |= DST_HOST;
1172 rt->dst.output = ip6_output;
1173 dst_set_neighbour(&rt->dst, neigh);
1174 atomic_set(&rt->dst.__refcnt, 1);
1175 rt->rt6i_dst.addr = fl6->daddr;
1176 rt->rt6i_dst.plen = 128;
1177 rt->rt6i_idev = idev;
1178 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1180 spin_lock_bh(&icmp6_dst_lock);
1181 rt->dst.next = icmp6_dst_gc_list;
1182 icmp6_dst_gc_list = &rt->dst;
1183 spin_unlock_bh(&icmp6_dst_lock);
1185 fib6_force_start_gc(net);
1187 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1193 int icmp6_dst_gc(void)
1195 struct dst_entry *dst, **pprev;
1198 spin_lock_bh(&icmp6_dst_lock);
1199 pprev = &icmp6_dst_gc_list;
1201 while ((dst = *pprev) != NULL) {
1202 if (!atomic_read(&dst->__refcnt)) {
1211 spin_unlock_bh(&icmp6_dst_lock);
1216 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1219 struct dst_entry *dst, **pprev;
1221 spin_lock_bh(&icmp6_dst_lock);
1222 pprev = &icmp6_dst_gc_list;
1223 while ((dst = *pprev) != NULL) {
1224 struct rt6_info *rt = (struct rt6_info *) dst;
1225 if (func(rt, arg)) {
1232 spin_unlock_bh(&icmp6_dst_lock);
1235 static int ip6_dst_gc(struct dst_ops *ops)
1237 unsigned long now = jiffies;
1238 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1239 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1240 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1241 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1242 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1243 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1246 entries = dst_entries_get_fast(ops);
1247 if (time_after(rt_last_gc + rt_min_interval, now) &&
1248 entries <= rt_max_size)
1251 net->ipv6.ip6_rt_gc_expire++;
1252 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1253 net->ipv6.ip6_rt_last_gc = now;
1254 entries = dst_entries_get_slow(ops);
1255 if (entries < ops->gc_thresh)
1256 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1258 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1259 return entries > rt_max_size;
1262 /* Clean host part of a prefix. Not necessary in radix tree,
1263 but results in cleaner routing tables.
1265 Remove it only when all the things will work!
1268 int ip6_dst_hoplimit(struct dst_entry *dst)
1270 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1271 if (hoplimit == 0) {
1272 struct net_device *dev = dst->dev;
1273 struct inet6_dev *idev;
1276 idev = __in6_dev_get(dev);
1278 hoplimit = idev->cnf.hop_limit;
1280 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1285 EXPORT_SYMBOL(ip6_dst_hoplimit);
1291 int ip6_route_add(struct fib6_config *cfg)
1294 struct net *net = cfg->fc_nlinfo.nl_net;
1295 struct rt6_info *rt = NULL;
1296 struct net_device *dev = NULL;
1297 struct inet6_dev *idev = NULL;
1298 struct fib6_table *table;
1301 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1303 #ifndef CONFIG_IPV6_SUBTREES
1304 if (cfg->fc_src_len)
1307 if (cfg->fc_ifindex) {
1309 dev = dev_get_by_index(net, cfg->fc_ifindex);
1312 idev = in6_dev_get(dev);
1317 if (cfg->fc_metric == 0)
1318 cfg->fc_metric = IP6_RT_PRIO_USER;
1321 if (cfg->fc_nlinfo.nlh &&
1322 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1323 table = fib6_get_table(net, cfg->fc_table);
1325 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1326 table = fib6_new_table(net, cfg->fc_table);
1329 table = fib6_new_table(net, cfg->fc_table);
1335 rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT, table);
1342 rt->dst.obsolete = -1;
1344 if (cfg->fc_flags & RTF_EXPIRES)
1345 rt6_set_expires(rt, jiffies +
1346 clock_t_to_jiffies(cfg->fc_expires));
1348 rt6_clean_expires(rt);
1350 if (cfg->fc_protocol == RTPROT_UNSPEC)
1351 cfg->fc_protocol = RTPROT_BOOT;
1352 rt->rt6i_protocol = cfg->fc_protocol;
1354 addr_type = ipv6_addr_type(&cfg->fc_dst);
1356 if (addr_type & IPV6_ADDR_MULTICAST)
1357 rt->dst.input = ip6_mc_input;
1358 else if (cfg->fc_flags & RTF_LOCAL)
1359 rt->dst.input = ip6_input;
1361 rt->dst.input = ip6_forward;
1363 rt->dst.output = ip6_output;
1365 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1366 rt->rt6i_dst.plen = cfg->fc_dst_len;
1367 if (rt->rt6i_dst.plen == 128)
1368 rt->dst.flags |= DST_HOST;
1370 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1371 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1376 dst_init_metrics(&rt->dst, metrics, 0);
1378 #ifdef CONFIG_IPV6_SUBTREES
1379 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1380 rt->rt6i_src.plen = cfg->fc_src_len;
1383 rt->rt6i_metric = cfg->fc_metric;
1385 /* We cannot add true routes via loopback here,
1386 they would result in kernel looping; promote them to reject routes
1388 if ((cfg->fc_flags & RTF_REJECT) ||
1389 (dev && (dev->flags & IFF_LOOPBACK) &&
1390 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1391 !(cfg->fc_flags & RTF_LOCAL))) {
1392 /* hold loopback dev/idev if we haven't done so. */
1393 if (dev != net->loopback_dev) {
1398 dev = net->loopback_dev;
1400 idev = in6_dev_get(dev);
1406 rt->dst.output = ip6_pkt_discard_out;
1407 rt->dst.input = ip6_pkt_discard;
1408 rt->dst.error = -ENETUNREACH;
1409 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1413 if (cfg->fc_flags & RTF_GATEWAY) {
1414 const struct in6_addr *gw_addr;
1417 gw_addr = &cfg->fc_gateway;
1418 rt->rt6i_gateway = *gw_addr;
1419 gwa_type = ipv6_addr_type(gw_addr);
1421 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1422 struct rt6_info *grt;
1424 /* IPv6 strictly inhibits using not link-local
1425 addresses as nexthop address.
1426 Otherwise, router will not be able to send redirects.
1427 It is very good, but in some (rare!) circumstances
1428 (SIT, PtP, NBMA NOARP links) it is handy to allow
1429 some exceptions. --ANK
1432 if (!(gwa_type & IPV6_ADDR_UNICAST))
1435 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1437 err = -EHOSTUNREACH;
1441 if (dev != grt->dst.dev) {
1442 dst_release(&grt->dst);
1447 idev = grt->rt6i_idev;
1449 in6_dev_hold(grt->rt6i_idev);
1451 if (!(grt->rt6i_flags & RTF_GATEWAY))
1453 dst_release(&grt->dst);
1459 if (!dev || (dev->flags & IFF_LOOPBACK))
1467 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1468 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1472 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1473 rt->rt6i_prefsrc.plen = 128;
1475 rt->rt6i_prefsrc.plen = 0;
1477 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1478 err = rt6_bind_neighbour(rt, dev);
1483 rt->rt6i_flags = cfg->fc_flags;
1490 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1491 int type = nla_type(nla);
1494 if (type > RTAX_MAX) {
1499 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1505 rt->rt6i_idev = idev;
1506 rt->rt6i_table = table;
1508 cfg->fc_nlinfo.nl_net = dev_net(dev);
1510 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1522 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1525 struct fib6_table *table;
1526 struct net *net = dev_net(rt->dst.dev);
1528 if (rt == net->ipv6.ip6_null_entry)
1531 table = rt->rt6i_table;
1532 write_lock_bh(&table->tb6_lock);
1534 err = fib6_del(rt, info);
1535 dst_release(&rt->dst);
1537 write_unlock_bh(&table->tb6_lock);
1542 int ip6_del_rt(struct rt6_info *rt)
1544 struct nl_info info = {
1545 .nl_net = dev_net(rt->dst.dev),
1547 return __ip6_del_rt(rt, &info);
/*
 * ip6_route_del - delete the route described by a fib6_config.
 * @cfg: table id, destination/source prefixes and optional filters
 *
 * Under the table read lock the node for the destination/source prefix
 * pair is located and its routes are scanned.  A candidate is filtered
 * on the output interface index (only when cfg->fc_ifindex is non-zero),
 * on the gateway address (only when RTF_GATEWAY is set in cfg->fc_flags)
 * and on the metric (only when cfg->fc_metric is non-zero).  For a match
 * the lock is released and the result of __ip6_del_rt() is returned.
 * NOTE(review): the NULL checks on the table and node, the statements
 * executed when a filter rejects a route, and the value returned when
 * nothing matches are on lines missing from this listing.
 */
1550 static int ip6_route_del(struct fib6_config *cfg)
1552 struct fib6_table *table;
1553 struct fib6_node *fn;
1554 struct rt6_info *rt;
1557 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1561 read_lock_bh(&table->tb6_lock);
1563 fn = fib6_locate(&table->tb6_root,
1564 &cfg->fc_dst, cfg->fc_dst_len,
1565 &cfg->fc_src, cfg->fc_src_len);
1568 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1569 if (cfg->fc_ifindex &&
1571 rt->dst.dev->ifindex != cfg->fc_ifindex))
1573 if (cfg->fc_flags & RTF_GATEWAY &&
1574 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1576 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
/* Drop the read lock first: __ip6_del_rt() takes the table write lock. */
1579 read_unlock_bh(&table->tb6_lock);
1581 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1584 read_unlock_bh(&table->tb6_lock);
/*
 * Lookup key used while validating a redirect: a flow plus the address
 * of the router that sent the redirect (@gateway).
 * NOTE(review): the leading member is on a line missing from this
 * listing.  Users of the struct cast a flowi6 pointer to it and access
 * a member named fl6, so a struct flowi6 fl6 is presumably the first
 * member - confirm.
 */
1592 struct ip6rd_flowi {
1594 struct in6_addr gateway;
/*
 * __ip6_route_redirect - per-table lookup used to validate a redirect.
 * @net:   namespace; supplies the null entry used as fallback
 * @table: FIB table to search
 *
 * The flow pointer fl6 is cast to a struct ip6rd_flowi so that the
 * address of the redirecting router can be read next to the flow.
 * Under the table read lock the node for the flow destination/source is
 * looked up and its routes are scanned; each candidate is tested for
 * expiry, for RTF_GATEWAY, for an output device whose ifindex equals
 * fl6->flowi6_oif and for a gateway equal to the redirect sender.
 * The null entry is assigned as fallback and BACKTRACK() is invoked
 * with the flow source address before the lock is released.
 * NOTE(review): the remaining parameters, the statements taken when a
 * test rejects a candidate, and the return path are on lines missing
 * from this listing - confirm the exact accept/reject flow.
 */
1597 static struct rt6_info *__ip6_route_redirect(struct net *net,
1598 struct fib6_table *table,
1602 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1603 struct rt6_info *rt;
1604 struct fib6_node *fn;
1607 * Get the "current" route for this destination and
1608 * check if the redirect has come from approriate router.
1610 * RFC 2461 specifies that redirects should only be
1611 * accepted if they come from the nexthop to the target.
1612 * Due to the way the routes are chosen, this notion
1613 * is a bit fuzzy and one might need to check all possible
1617 read_lock_bh(&table->tb6_lock);
1618 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1620 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1622 * Current route is on-link; redirect is always invalid.
1624 * Seems, previous statement is not true. It could
1625 * be node, which looks for us as on-link (f.e. proxy ndisc)
1626 * But then router serving it might decide, that we should
1627 * know truth 8)8) --ANK (980726).
1629 if (rt6_check_expired(rt))
1631 if (!(rt->rt6i_flags & RTF_GATEWAY))
1633 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1635 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1641 rt = net->ipv6.ip6_null_entry;
1642 BACKTRACK(net, &fl6->saddr);
1646 read_unlock_bh(&table->tb6_lock);
/*
 * ip6_route_redirect - find the route that a redirect would replace.
 * @dest:    destination the redirect refers to; also decides whether
 *           strict interface matching is requested
 * @src:     flow source address
 * @gateway: address of the router that sent the redirect
 * @dev:     device supplying the namespace and the flowi6_oif value
 *
 * Stores the sending gateway next to the flow in an ip6rd_flowi and runs
 * fib6_rule_lookup() with __ip6_route_redirect() as per-table callback.
 * RT6_LOOKUP_F_HAS_SADDR is always set; RT6_LOOKUP_F_IFACE is added when
 * rt6_need_strict(dest) is true.
 * NOTE(review): the initialisers that presumably copy @dest and @src
 * into the flow are on lines missing from this listing.
 */
1651 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1652 const struct in6_addr *src,
1653 const struct in6_addr *gateway,
1654 struct net_device *dev)
1656 int flags = RT6_LOOKUP_F_HAS_SADDR;
1657 struct net *net = dev_net(dev);
1658 struct ip6rd_flowi rdfl = {
1660 .flowi6_oif = dev->ifindex,
1666 rdfl.gateway = *gateway;
1668 if (rt6_need_strict(dest))
1669 flags |= RT6_LOOKUP_F_IFACE;
1671 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1672 flags, __ip6_route_redirect);
/*
 * rt6_redirect - process an accepted redirect.
 * @dest:    destination the redirect refers to
 * @src:     flow source address
 * @saddr:   address of the router that sent the redirect (handed to
 *           ip6_route_redirect() as the gateway to match)
 * @neigh:   neighbour entry of the new next hop
 * @lladdr:  link-layer address passed to neigh_update()
 * @on_link: non-zero suppresses the ISROUTER update flags
 *
 * Steps visible here: look up the current route and log a rate-limited
 * debug message when only the null entry is found; move the neighbour to
 * NUD_STALE with override flags; confirm the old route; compare the
 * neighbour with the one already bound to the route (duplicate redirect);
 * clone the route for @dest as a cached, dynamic host route that points
 * at the new neighbour; insert it; raise NETEVENT_REDIRECT with the old
 * and new dst; finally release the reference on the looked-up route.
 * NOTE(review): the jumps taken on the error/duplicate paths, the test
 * that guards clearing RTF_GATEWAY, and the action taken for an old
 * route flagged RTF_CACHE are on lines missing from this listing.
 */
1675 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1676 const struct in6_addr *saddr,
1677 struct neighbour *neigh, u8 *lladdr, int on_link)
1679 struct rt6_info *rt, *nrt = NULL;
1680 struct netevent_redirect netevent;
1681 struct net *net = dev_net(neigh->dev);
1683 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1685 if (rt == net->ipv6.ip6_null_entry) {
1686 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1691 * We have finally decided to accept it.
1694 neigh_update(neigh, lladdr, NUD_STALE,
1695 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1696 NEIGH_UPDATE_F_OVERRIDE|
1697 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1698 NEIGH_UPDATE_F_ISROUTER))
1702 * Redirect received -> path was valid.
1703 * Look, redirects are sent only in response to data packets,
1704 * so that this nexthop apparently is reachable. --ANK
1706 dst_confirm(&rt->dst);
1708 /* Duplicate redirect: silently ignore. */
1709 if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1712 nrt = ip6_rt_copy(rt, dest);
1716 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1718 nrt->rt6i_flags &= ~RTF_GATEWAY;
1720 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1721 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1723 if (ip6_ins_rt(nrt))
1726 netevent.old = &rt->dst;
1727 netevent.new = &nrt->dst;
1728 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1730 if (rt->rt6i_flags & RTF_CACHE) {
1736 dst_release(&rt->dst);
1740 * Misc support functions
/*
 * ip6_rt_copy - clone a route as a /128 host route for one destination.
 * @ort:  route to copy from
 * @dest: destination address of the clone
 *
 * The clone is allocated on the same device as @ort.  It inherits the
 * input/output handlers, metrics, error code, inet6_dev (an extra
 * reference is taken), gateway, flags, preferred source and table, and
 * is marked DST_HOST with lastuse set to the current jiffies.  The
 * metric of the clone is reset to 0.  rt6_set_from() is applied when
 * @ort has both RTF_DEFAULT and RTF_ADDRCONF set; the source key is only
 * copied when CONFIG_IPV6_SUBTREES is enabled.
 * NOTE(review): the allocation failure check, the branch that selects
 * rt6_clean_expires(), and the return statement are on lines missing
 * from this listing.
 */
1743 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1744 const struct in6_addr *dest)
1746 struct net *net = dev_net(ort->dst.dev);
1747 struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
1751 rt->dst.input = ort->dst.input;
1752 rt->dst.output = ort->dst.output;
1753 rt->dst.flags |= DST_HOST;
1755 rt->rt6i_dst.addr = *dest;
1756 rt->rt6i_dst.plen = 128;
1757 dst_copy_metrics(&rt->dst, &ort->dst);
1758 rt->dst.error = ort->dst.error;
1759 rt->rt6i_idev = ort->rt6i_idev;
1761 in6_dev_hold(rt->rt6i_idev);
1762 rt->dst.lastuse = jiffies;
1764 rt->rt6i_gateway = ort->rt6i_gateway;
1765 rt->rt6i_flags = ort->rt6i_flags;
1766 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1767 (RTF_DEFAULT | RTF_ADDRCONF))
1768 rt6_set_from(rt, ort);
1770 rt6_clean_expires(rt);
1771 rt->rt6i_metric = 0;
1773 #ifdef CONFIG_IPV6_SUBTREES
1774 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1776 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1777 rt->rt6i_table = ort->rt6i_table;
1782 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_get_route_info - find a route-information route.
 * @net:       namespace to search
 * @prefix:    route prefix
 * @prefixlen: prefix length in bits
 * @gwaddr:    gateway the route must use
 * @ifindex:   index of the device the route must use
 *
 * Searches RT6_TABLE_INFO under the table write lock.  A route matches
 * when its device index equals @ifindex, both RTF_ROUTEINFO and
 * RTF_GATEWAY are set, and its gateway equals @gwaddr.
 * NOTE(review): the NULL checks, the statements taken on a mismatch,
 * any reference taken on the match, and the return are on lines missing
 * from this listing.
 */
1783 static struct rt6_info *rt6_get_route_info(struct net *net,
1784 const struct in6_addr *prefix, int prefixlen,
1785 const struct in6_addr *gwaddr, int ifindex)
1787 struct fib6_node *fn;
1788 struct rt6_info *rt = NULL;
1789 struct fib6_table *table;
1791 table = fib6_get_table(net, RT6_TABLE_INFO);
1795 write_lock_bh(&table->tb6_lock);
1796 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1800 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1801 if (rt->dst.dev->ifindex != ifindex)
1803 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1805 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1811 write_unlock_bh(&table->tb6_lock);
/*
 * rt6_add_route_info - install a route-information route and return it.
 * @net:       namespace
 * @prefix:    route prefix
 * @prefixlen: prefix length in bits
 * @gwaddr:    gateway address
 * @ifindex:   output interface index
 *
 * Fills a fib6_config for RT6_TABLE_INFO with metric IP6_RT_PRIO_USER
 * and flags RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP plus the
 * encoded preference, then calls ip6_route_add().  The call is written as
 * a bare statement, so its result is not used; the caller learns the
 * outcome from the rt6_get_route_info() lookup that is returned.
 * NOTE(review): the last parameter (used as pref) and the test that
 * guards adding RTF_DEFAULT are on lines missing from this listing.
 */
1815 static struct rt6_info *rt6_add_route_info(struct net *net,
1816 const struct in6_addr *prefix, int prefixlen,
1817 const struct in6_addr *gwaddr, int ifindex,
1820 struct fib6_config cfg = {
1821 .fc_table = RT6_TABLE_INFO,
1822 .fc_metric = IP6_RT_PRIO_USER,
1823 .fc_ifindex = ifindex,
1824 .fc_dst_len = prefixlen,
1825 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1826 RTF_UP | RTF_PREF(pref),
1828 .fc_nlinfo.nlh = NULL,
1829 .fc_nlinfo.nl_net = net,
1832 cfg.fc_dst = *prefix;
1833 cfg.fc_gateway = *gwaddr;
1835 /* We should treat it as a default route if prefix length is 0. */
1837 cfg.fc_flags |= RTF_DEFAULT;
1839 ip6_route_add(&cfg);
1841 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/*
 * rt6_get_dflt_router - find the autoconfigured default route via a router.
 * @addr: gateway address of the router
 * @dev:  device the route must use
 *
 * Scans the routes hanging off the root of RT6_TABLE_DFLT (in the
 * namespace of @dev) under the table write lock.  A route matches when
 * it uses @dev, has BOTH RTF_ADDRCONF and RTF_DEFAULT set, and its
 * gateway equals @addr.
 * NOTE(review): the table NULL check, any reference taken on the match,
 * and the return are on lines missing from this listing.
 */
1845 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1847 struct rt6_info *rt;
1848 struct fib6_table *table;
1850 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1854 write_lock_bh(&table->tb6_lock);
1855 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1856 if (dev == rt->dst.dev &&
1857 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1858 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1863 write_unlock_bh(&table->tb6_lock);
/*
 * rt6_add_dflt_router - install a default route via a router and return it.
 * @gwaddr: gateway address of the router
 * @dev:    device the route uses; also supplies the namespace
 *
 * Fills a fib6_config for RT6_TABLE_DFLT with metric IP6_RT_PRIO_USER and
 * flags RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES
 * plus the encoded preference, then calls ip6_route_add().  That call is
 * a bare statement, so its result is not used; the function returns the
 * result of a fresh rt6_get_dflt_router() lookup instead.
 * NOTE(review): the last parameter (used as pref) is on a line missing
 * from this listing.
 */
1867 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1868 struct net_device *dev,
1871 struct fib6_config cfg = {
1872 .fc_table = RT6_TABLE_DFLT,
1873 .fc_metric = IP6_RT_PRIO_USER,
1874 .fc_ifindex = dev->ifindex,
1875 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1876 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1878 .fc_nlinfo.nlh = NULL,
1879 .fc_nlinfo.nl_net = dev_net(dev),
1882 cfg.fc_gateway = *gwaddr;
1884 ip6_route_add(&cfg);
1886 return rt6_get_dflt_router(gwaddr, dev);
/*
 * rt6_purge_dflt_routers - act on autoconfigured default routes.
 * @net: namespace whose RT6_TABLE_DFLT is scanned
 *
 * Walks the routes at the table root under the read lock.  When a route
 * has RTF_DEFAULT or RTF_ADDRCONF set the lock is released inside the
 * loop body; the lock is also released after the loop.
 * NOTE(review): the flag test here is true when EITHER bit is set, while
 * rt6_get_dflt_router() requires BOTH bits; given the note below asking
 * for consistency, confirm whether the looser test is intended.
 * NOTE(review): the statements that follow the in-loop unlock
 * (presumably deleting the route and rescanning) are on lines missing
 * from this listing.
 */
1889 void rt6_purge_dflt_routers(struct net *net)
1891 struct rt6_info *rt;
1892 struct fib6_table *table;
1894 /* NOTE: Keep consistent with rt6_get_dflt_router */
1895 table = fib6_get_table(net, RT6_TABLE_DFLT);
1900 read_lock_bh(&table->tb6_lock);
1901 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1902 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1904 read_unlock_bh(&table->tb6_lock);
1909 read_unlock_bh(&table->tb6_lock);
/*
 * rtmsg_to_fib6_config - translate an ioctl route message to fib6_config.
 * @net:   namespace recorded in the netlink info of @cfg
 * @rtmsg: message received through the routing ioctl
 * @cfg:   output; zeroed first, then filled field by field
 *
 * The table is always RT6_TABLE_MAIN.  rtmsg_info is stored as the
 * expiry value; interface index, metric, prefix lengths, flags and the
 * three addresses are copied unchanged.
 */
1912 static void rtmsg_to_fib6_config(struct net *net,
1913 struct in6_rtmsg *rtmsg,
1914 struct fib6_config *cfg)
1916 memset(cfg, 0, sizeof(*cfg));
1918 cfg->fc_table = RT6_TABLE_MAIN;
1919 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1920 cfg->fc_metric = rtmsg->rtmsg_metric;
1921 cfg->fc_expires = rtmsg->rtmsg_info;
1922 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1923 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1924 cfg->fc_flags = rtmsg->rtmsg_flags;
1926 cfg->fc_nlinfo.nl_net = net;
1928 cfg->fc_dst = rtmsg->rtmsg_dst;
1929 cfg->fc_src = rtmsg->rtmsg_src;
1930 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/*
 * ipv6_route_ioctl - handle the SIOCADDRT / SIOCDELRT routing ioctls.
 * @net: namespace the request applies to
 * @cmd: ioctl command
 * @arg: user pointer to a struct in6_rtmsg
 *
 * Both commands require CAP_NET_ADMIN.  The message is copied in from
 * user space, converted with rtmsg_to_fib6_config() and then handed to
 * ip6_route_add() or ip6_route_del().
 * NOTE(review): the switch statement itself, the error returns, any
 * locking around the add/delete calls, and the handling of other
 * commands are on lines missing from this listing.
 */
1933 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1935 struct fib6_config cfg;
1936 struct in6_rtmsg rtmsg;
1940 case SIOCADDRT: /* Add a route */
1941 case SIOCDELRT: /* Delete a route */
1942 if (!capable(CAP_NET_ADMIN))
1944 err = copy_from_user(&rtmsg, arg,
1945 sizeof(struct in6_rtmsg));
1949 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1954 err = ip6_route_add(&cfg);
1957 err = ip6_route_del(&cfg);
1971 * Drop the packet on the floor
/*
 * ip6_pkt_drop - account for an unroutable packet and report it via ICMPv6.
 * @skb:                  packet; its attached dst supplies device and idev
 * @code:                 ICMPv6 destination-unreachable code to send
 * @ipstats_mib_noroutes: IPSTATS_MIB_INNOROUTES or IPSTATS_MIB_OUTNOROUTES
 *
 * For the input case an unspecified destination address is counted as
 * IPSTATS_MIB_INADDRERRORS.  The counter named by @ipstats_mib_noroutes
 * is incremented under the IPSTATS_MIB_OUTNOROUTES label, and an
 * ICMPV6_DEST_UNREACH message with @code is sent for the packet.
 * NOTE(review): the break/fall-through structure of the switch, the
 * release of the skb, and the return value are on lines missing from
 * this listing.
 */
1974 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1977 struct dst_entry *dst = skb_dst(skb);
1978 switch (ipstats_mib_noroutes) {
1979 case IPSTATS_MIB_INNOROUTES:
1980 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1981 if (type == IPV6_ADDR_ANY) {
1982 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1983 IPSTATS_MIB_INADDRERRORS);
1987 case IPSTATS_MIB_OUTNOROUTES:
1988 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1989 ipstats_mib_noroutes);
1992 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/*
 * ip6_pkt_discard - drop a packet with no route, counting it as
 * IPSTATS_MIB_INNOROUTES and answering with ICMPV6_NOROUTE.
 */
1997 static int ip6_pkt_discard(struct sk_buff *skb)
1999 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/*
 * ip6_pkt_discard_out - variant of ip6_pkt_discard() that counts
 * IPSTATS_MIB_OUTNOROUTES.  skb->dev is first set to the device of the
 * attached dst.
 */
2002 static int ip6_pkt_discard_out(struct sk_buff *skb)
2004 skb->dev = skb_dst(skb)->dev;
2005 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2008 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/*
 * ip6_pkt_prohibit - drop a packet, counting it as
 * IPSTATS_MIB_INNOROUTES and answering with ICMPV6_ADM_PROHIBITED.
 */
2010 static int ip6_pkt_prohibit(struct sk_buff *skb)
2012 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/*
 * ip6_pkt_prohibit_out - variant of ip6_pkt_prohibit() that counts
 * IPSTATS_MIB_OUTNOROUTES.  skb->dev is first set to the device of the
 * attached dst.
 */
2015 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2017 skb->dev = skb_dst(skb)->dev;
2018 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2024 * Allocate a dst for local (unicast / anycast) address.
/*
 * addrconf_dst_alloc - build the host route for a locally owned address.
 * @idev: inet6 device owning the address; stored in the route
 * @addr: the address; becomes the /128 destination
 *
 * The route is allocated on the loopback device of the namespace, uses
 * ip6_input / ip6_output as handlers, is marked DST_HOST with obsolete
 * set to -1, and starts with RTF_UP | RTF_NONEXTHOP.  RTF_ANYCAST or
 * RTF_LOCAL is added, a neighbour is bound on the route device, the
 * table is RT6_TABLE_LOCAL and the reference count is set to 1.
 * Failures are reported as ERR_PTR values (-ENOMEM for the allocation,
 * the rt6_bind_neighbour() error otherwise).
 * NOTE(review): the last parameter, the test that chooses between
 * RTF_ANYCAST and RTF_LOCAL, the cleanup on the bind error path, and the
 * final return are on lines missing from this listing.
 */
2027 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2028 const struct in6_addr *addr,
2031 struct net *net = dev_net(idev->dev);
2032 struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL);
2036 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2037 return ERR_PTR(-ENOMEM);
2042 rt->dst.flags |= DST_HOST;
2043 rt->dst.input = ip6_input;
2044 rt->dst.output = ip6_output;
2045 rt->rt6i_idev = idev;
2046 rt->dst.obsolete = -1;
2048 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2050 rt->rt6i_flags |= RTF_ANYCAST;
2052 rt->rt6i_flags |= RTF_LOCAL;
2053 err = rt6_bind_neighbour(rt, rt->dst.dev);
2056 return ERR_PTR(err);
2059 rt->rt6i_dst.addr = *addr;
2060 rt->rt6i_dst.plen = 128;
2061 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2063 atomic_set(&rt->dst.__refcnt, 1);
/*
 * ip6_route_get_saddr - choose the source address for a route.
 * @net:   namespace
 * @rt:    route being used
 * @daddr: destination address
 * @saddr: output; receives the chosen source address
 *
 * A route with a preferred source (rt6i_prefsrc.plen non-zero) supplies
 * that address directly.  The other visible path asks
 * ipv6_dev_get_saddr() using the device of the route idev, or NULL when
 * the route has no idev.
 * NOTE(review): one parameter (used as prefs), the else keyword that
 * presumably separates the two paths, and the return are on lines
 * missing from this listing.
 */
2068 int ip6_route_get_saddr(struct net *net,
2069 struct rt6_info *rt,
2070 const struct in6_addr *daddr,
2072 struct in6_addr *saddr)
2074 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2076 if (rt->rt6i_prefsrc.plen)
2077 *saddr = rt->rt6i_prefsrc.addr;
2079 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2080 daddr, prefs, saddr);
2084 /* remove deleted ip from prefsrc entries */
/*
 * Argument bundle for fib6_remove_prefsrc(): the device to restrict the
 * walk to and the address being removed.
 * NOTE(review): fib6_remove_prefsrc() also reads a member named net; its
 * declaration is on a line missing from this listing.
 */
2085 struct arg_dev_net_ip {
2086 struct net_device *dev;
2088 struct in6_addr *addr;
/*
 * fib6_remove_prefsrc - tree-walk callback clearing a preferred source.
 * @rt:  route being visited
 * @arg: struct arg_dev_net_ip with device, namespace and address
 *
 * When the route uses the given device (or no device was given), is not
 * the null entry, and its preferred source equals the address, the
 * preferred source is disabled by setting its prefix length to 0.
 * NOTE(review): the return value is on a line missing from this listing.
 */
2091 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2093 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2094 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2095 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2097 if (((void *)rt->dst.dev == dev || !dev) &&
2098 rt != net->ipv6.ip6_null_entry &&
2099 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2100 /* remove prefsrc entry */
2101 rt->rt6i_prefsrc.plen = 0;
/*
 * rt6_remove_prefsrc - stop routes from preferring a removed address.
 * @ifp: interface address being removed
 *
 * Runs fib6_remove_prefsrc() over all routes of the namespace that the
 * address device belongs to, restricted to that device.
 * NOTE(review): the remaining initialisers of adni are on lines missing
 * from this listing.
 */
2106 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2108 struct net *net = dev_net(ifp->idev->dev);
2109 struct arg_dev_net_ip adni = {
2110 .dev = ifp->idev->dev,
2114 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/*
 * Argument bundle for fib6_ifdown(): the device going down.
 * NOTE(review): fib6_ifdown() also reads a member named net; its
 * declaration is on a line missing from this listing.
 */
2117 struct arg_dev_net {
2118 struct net_device *dev;
/*
 * fib6_ifdown - tree-walk callback selecting routes of a downed device.
 * @rt:  route being visited
 * @arg: struct arg_dev_net with device and namespace
 *
 * A route is selected when it uses the given device (or no device was
 * given) and is not the null entry of the namespace.
 * NOTE(review): the values returned for selected and unselected routes
 * are on lines missing from this listing.
 */
2122 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2124 const struct arg_dev_net *adn = arg;
2125 const struct net_device *dev = adn->dev;
2127 if ((rt->dst.dev == dev || !dev) &&
2128 rt != adn->net->ipv6.ip6_null_entry)
/*
 * rt6_ifdown - apply fib6_ifdown() to every route of a namespace.
 * @net: namespace to walk
 * @dev: device going down
 *
 * The same callback is run over the FIB tables (fib6_clean_all) and then
 * through icmp6_clean_all().
 * NOTE(review): the initialisers of adn are on lines missing from this
 * listing.
 */
2134 void rt6_ifdown(struct net *net, struct net_device *dev)
2136 struct arg_dev_net adn = {
2141 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2142 icmp6_clean_all(fib6_ifdown, &adn);
/*
 * Argument bundle for rt6_mtu_change_route(): the device whose MTU
 * changed.
 * NOTE(review): rt6_mtu_change_route() also reads a member named mtu;
 * its declaration is on a line missing from this listing.
 */
2145 struct rt6_mtu_change_arg {
2146 struct net_device *dev;
/*
 * rt6_mtu_change_route - tree-walk callback applying a device MTU change.
 * @rt:    route being visited
 * @p_arg: struct rt6_mtu_change_arg with the device and the new MTU
 *
 * The RTAX_MTU metric of the route is set to the new MTU when the route
 * uses the device, the metric is not locked, and either the current
 * route MTU is greater than or equal to the new MTU, or it is lower than
 * the new MTU and equal to the mtu6 value held in the device idev.
 * NOTE(review): the check for a missing idev and the return value are on
 * lines missing from this listing.
 */
2150 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2152 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2153 struct inet6_dev *idev;
2155 /* In IPv6 pmtu discovery is not optional,
2156 so that RTAX_MTU lock cannot disable it.
2157 We still use this lock to block changes
2158 caused by addrconf/ndisc.
2161 idev = __in6_dev_get(arg->dev);
2165 /* For administrative MTU increase, there is no way to discover
2166 IPv6 PMTU increase, so PMTU increase should be updated here.
2167 Since RFC 1981 doesn't include administrative MTU increase
2168 update PMTU increase is a MUST. (i.e. jumbo frame)
2171 If new MTU is less than route PMTU, this new MTU will be the
2172 lowest MTU in the path, update the route PMTU to reflect PMTU
2173 decreases; if new MTU is greater than route PMTU, and the
2174 old MTU is the lowest MTU in the path, update the route PMTU
2175 to reflect the increase. In this case if the other nodes' MTU
2176 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2179 if (rt->dst.dev == arg->dev &&
2180 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2181 (dst_mtu(&rt->dst) >= arg->mtu ||
2182 (dst_mtu(&rt->dst) < arg->mtu &&
2183 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2184 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/*
 * rt6_mtu_change - propagate a device MTU change to its routes.
 * @dev: device whose MTU changed
 * @mtu: new MTU
 *
 * Runs rt6_mtu_change_route() over all routes in the namespace of @dev.
 * NOTE(review): the initialisers of arg are on lines missing from this
 * listing.
 */
2189 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2191 struct rt6_mtu_change_arg arg = {
2196 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/*
 * Netlink attribute policy for IPv6 route messages; passed to
 * nlmsg_parse() by the route message handlers in this file.
 * RTA_GATEWAY is constrained by length (one IPv6 address), the interface
 * and priority attributes are 32-bit values, RTA_METRICS is nested.
 */
2199 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2200 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2201 [RTA_OIF] = { .type = NLA_U32 },
2202 [RTA_IIF] = { .type = NLA_U32 },
2203 [RTA_PRIORITY] = { .type = NLA_U32 },
2204 [RTA_METRICS] = { .type = NLA_NESTED },
/*
 * rtm_to_fib6_config - translate a netlink route message to fib6_config.
 * @skb: request buffer; supplies the sender pid and the namespace
 * @nlh: netlink header of the request
 * @cfg: output; zeroed after a successful parse, then filled
 *
 * Attributes are parsed against rtm_ipv6_policy.  Flags start as RTF_UP;
 * RTN_UNREACHABLE adds RTF_REJECT, RTN_LOCAL adds RTF_LOCAL, and a
 * gateway attribute adds RTF_GATEWAY.  Destination and source prefixes
 * are copied with a byte count rounded up from the bit length, after a
 * check that the attribute holds at least that many bytes.  A table
 * attribute replaces the table id taken from the message header.
 * NOTE(review): the error returns and the presence tests guarding the
 * RTA_DST, RTA_SRC, RTA_OIF and RTA_TABLE branches are on lines missing
 * from this listing.
 */
2207 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2208 struct fib6_config *cfg)
2211 struct nlattr *tb[RTA_MAX+1];
2214 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2219 rtm = nlmsg_data(nlh);
2220 memset(cfg, 0, sizeof(*cfg));
2222 cfg->fc_table = rtm->rtm_table;
2223 cfg->fc_dst_len = rtm->rtm_dst_len;
2224 cfg->fc_src_len = rtm->rtm_src_len;
2225 cfg->fc_flags = RTF_UP;
2226 cfg->fc_protocol = rtm->rtm_protocol;
2228 if (rtm->rtm_type == RTN_UNREACHABLE)
2229 cfg->fc_flags |= RTF_REJECT;
2231 if (rtm->rtm_type == RTN_LOCAL)
2232 cfg->fc_flags |= RTF_LOCAL;
2234 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2235 cfg->fc_nlinfo.nlh = nlh;
2236 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2238 if (tb[RTA_GATEWAY]) {
2239 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2240 cfg->fc_flags |= RTF_GATEWAY;
2244 int plen = (rtm->rtm_dst_len + 7) >> 3;
2246 if (nla_len(tb[RTA_DST]) < plen)
2249 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2253 int plen = (rtm->rtm_src_len + 7) >> 3;
2255 if (nla_len(tb[RTA_SRC]) < plen)
2258 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2261 if (tb[RTA_PREFSRC])
2262 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2265 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2267 if (tb[RTA_PRIORITY])
2268 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2270 if (tb[RTA_METRICS]) {
2271 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2272 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2276 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/*
 * inet6_rtm_delroute - netlink handler registered for RTM_DELROUTE.
 * Converts the request with rtm_to_fib6_config() and returns the result
 * of ip6_route_del().
 * NOTE(review): the handling of a conversion error is on a line missing
 * from this listing.
 */
2283 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2285 struct fib6_config cfg;
2288 err = rtm_to_fib6_config(skb, nlh, &cfg);
2292 return ip6_route_del(&cfg);
/*
 * inet6_rtm_newroute - netlink handler registered for RTM_NEWROUTE.
 * Converts the request with rtm_to_fib6_config() and returns the result
 * of ip6_route_add().
 * NOTE(review): the handling of a conversion error is on a line missing
 * from this listing.
 */
2295 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2297 struct fib6_config cfg;
2300 err = rtm_to_fib6_config(skb, nlh, &cfg);
2304 return ip6_route_add(&cfg);
/*
 * rt6_nlmsg_size - upper bound on the size of one route notification.
 * Sums the aligned rtmsg header, four 16-byte address attributes, four
 * 4-byte attributes, RTAX_MAX 4-byte metric slots and the cache info.
 * Used to size the buffer in inet6_rt_notify(), which warns when
 * rt6_fill_node() still reports -EMSGSIZE.
 */
2307 static inline size_t rt6_nlmsg_size(void)
2309 return NLMSG_ALIGN(sizeof(struct rtmsg))
2310 + nla_total_size(16) /* RTA_SRC */
2311 + nla_total_size(16) /* RTA_DST */
2312 + nla_total_size(16) /* RTA_GATEWAY */
2313 + nla_total_size(16) /* RTA_PREFSRC */
2314 + nla_total_size(4) /* RTA_TABLE */
2315 + nla_total_size(4) /* RTA_IIF */
2316 + nla_total_size(4) /* RTA_OIF */
2317 + nla_total_size(4) /* RTA_PRIORITY */
2318 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2319 + nla_total_size(sizeof(struct rta_cacheinfo));
/*
 * rt6_fill_node - serialise one route into a netlink route message.
 * @net:    namespace; passed to ip6mr_get_route() and
 *          ip6_route_get_saddr()
 * @skb:    message buffer being filled
 * @rt:     route to describe
 * @dst:    explicit destination; on that path RTA_DST carries it and the
 *          reported prefix length becomes 128
 * @src:    explicit source, same treatment for RTA_SRC
 *          (CONFIG_IPV6_SUBTREES only)
 * @iif:    value emitted as RTA_IIF
 * @type:   netlink message type
 * @pid:    netlink port id for the header
 * @seq:    netlink sequence number for the header
 * @prefix: non-zero restricts output to routes flagged RTF_PREFIX_RT
 * @nowait: forwarded to ip6mr_get_route() for multicast destinations
 * @flags:  netlink header flags
 *
 * Emits table, route type, scope, protocol, addresses, preferred source,
 * metrics, gateway (taken from the bound neighbour), output interface,
 * priority and cache info.  The protocol from the route is overridden by
 * RTPROT_REDIRECT, RTPROT_KERNEL or RTPROT_RA depending on the
 * RTF_DYNAMIC, RTF_ADDRCONF and RTF_DEFAULT flags, tested in that order.
 * A failed attribute put jumps to nla_put_failure, where the partial
 * message is cancelled; the success path returns nlmsg_end().
 * NOTE(review): several tests (those guarding the @dst, @src and @iif
 * branches, the neighbour NULL check, the expiry fallbacks) and the
 * failure return value are on lines missing from this listing.
 */
2322 static int rt6_fill_node(struct net *net,
2323 struct sk_buff *skb, struct rt6_info *rt,
2324 struct in6_addr *dst, struct in6_addr *src,
2325 int iif, int type, u32 pid, u32 seq,
2326 int prefix, int nowait, unsigned int flags)
2328 const struct inet_peer *peer;
2330 struct nlmsghdr *nlh;
2333 struct neighbour *n;
2336 if (prefix) { /* user wants prefix routes only */
2337 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2338 /* success since this is not a prefix route */
2343 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2347 rtm = nlmsg_data(nlh);
2348 rtm->rtm_family = AF_INET6;
2349 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2350 rtm->rtm_src_len = rt->rt6i_src.plen;
2353 table = rt->rt6i_table->tb6_id;
2355 table = RT6_TABLE_UNSPEC;
2356 rtm->rtm_table = table;
2357 if (nla_put_u32(skb, RTA_TABLE, table))
2358 goto nla_put_failure;
2359 if (rt->rt6i_flags & RTF_REJECT)
2360 rtm->rtm_type = RTN_UNREACHABLE;
2361 else if (rt->rt6i_flags & RTF_LOCAL)
2362 rtm->rtm_type = RTN_LOCAL;
2363 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2364 rtm->rtm_type = RTN_LOCAL;
2366 rtm->rtm_type = RTN_UNICAST;
2368 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2369 rtm->rtm_protocol = rt->rt6i_protocol;
2370 if (rt->rt6i_flags & RTF_DYNAMIC)
2371 rtm->rtm_protocol = RTPROT_REDIRECT;
2372 else if (rt->rt6i_flags & RTF_ADDRCONF)
2373 rtm->rtm_protocol = RTPROT_KERNEL;
2374 else if (rt->rt6i_flags & RTF_DEFAULT)
2375 rtm->rtm_protocol = RTPROT_RA;
2377 if (rt->rt6i_flags & RTF_CACHE)
2378 rtm->rtm_flags |= RTM_F_CLONED;
2381 if (nla_put(skb, RTA_DST, 16, dst))
2382 goto nla_put_failure;
2383 rtm->rtm_dst_len = 128;
2384 } else if (rtm->rtm_dst_len)
2385 if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2386 goto nla_put_failure;
2387 #ifdef CONFIG_IPV6_SUBTREES
2389 if (nla_put(skb, RTA_SRC, 16, src))
2390 goto nla_put_failure;
2391 rtm->rtm_src_len = 128;
2392 } else if (rtm->rtm_src_len &&
2393 nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2394 goto nla_put_failure;
2397 #ifdef CONFIG_IPV6_MROUTE
2398 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2399 int err = ip6mr_get_route(net, skb, rtm, nowait);
2404 goto nla_put_failure;
2406 if (err == -EMSGSIZE)
2407 goto nla_put_failure;
2412 if (nla_put_u32(skb, RTA_IIF, iif))
2413 goto nla_put_failure;
2415 struct in6_addr saddr_buf;
2416 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2417 nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2418 goto nla_put_failure;
2421 if (rt->rt6i_prefsrc.plen) {
2422 struct in6_addr saddr_buf;
2423 saddr_buf = rt->rt6i_prefsrc.addr;
2424 if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2425 goto nla_put_failure;
2428 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2429 goto nla_put_failure;
2432 n = dst_get_neighbour_noref(&rt->dst);
2434 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2436 goto nla_put_failure;
2442 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2443 goto nla_put_failure;
2444 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2445 goto nla_put_failure;
2446 if (!(rt->rt6i_flags & RTF_EXPIRES))
2448 else if (rt->dst.expires - jiffies < INT_MAX)
2449 expires = rt->dst.expires - jiffies;
2454 if (rt6_has_peer(rt))
2455 peer = rt6_peer_ptr(rt);
2457 if (peer && peer->tcp_ts_stamp) {
2459 tsage = get_seconds() - peer->tcp_ts_stamp;
2462 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2463 expires, rt->dst.error) < 0)
2464 goto nla_put_failure;
2466 return nlmsg_end(skb, nlh);
2469 nlmsg_cancel(skb, nlh);
/*
 * rt6_dump_route - emit one route as part of a netlink dump.
 * @rt:    route to emit
 * @p_arg: struct rt6_rtnl_dump_arg with namespace, output skb and callback
 *
 * When the dump request is large enough to hold an rtmsg, its
 * RTM_F_PREFIX flag selects prefix-only output.  The route is written as
 * an RTM_NEWROUTE message flagged NLM_F_MULTI, addressed with the pid and
 * sequence number of the request.
 * NOTE(review): the declaration of prefix and the value it takes when
 * the request is too short are on lines missing from this listing.
 */
2473 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2475 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2478 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2479 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2480 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2484 return rt6_fill_node(arg->net,
2485 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2486 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2487 prefix, 0, NLM_F_MULTI);
/*
 * inet6_rtm_getroute - netlink handler registered for RTM_GETROUTE.
 * @in_skb: request buffer; supplies the namespace and the reply address
 * @nlh:    netlink header of the request
 * @arg:    not referenced on the lines visible here
 *
 * Builds a flow from the RTA_SRC, RTA_DST, RTA_IIF and RTA_OIF
 * attributes (address attributes must hold a full IPv6 address).  One
 * path resolves the input device by index and performs an input-side
 * lookup, adding RT6_LOOKUP_F_HAS_SADDR when a source address is set;
 * the other path performs an output lookup with the requested oif.
 * The result is attached to a freshly allocated skb, serialised with
 * rt6_fill_node() as RTM_NEWROUTE, and unicast back to the requester.
 * The route reference is dropped when the skb allocation path releases
 * it.
 * NOTE(review): the presence tests on the attributes, the test that
 * chooses between input and output lookup, and all error returns are on
 * lines missing from this listing.
 */
2490 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2492 struct net *net = sock_net(in_skb->sk);
2493 struct nlattr *tb[RTA_MAX+1];
2494 struct rt6_info *rt;
2495 struct sk_buff *skb;
2498 int err, iif = 0, oif = 0;
2500 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2505 memset(&fl6, 0, sizeof(fl6));
2508 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2511 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2515 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2518 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2522 iif = nla_get_u32(tb[RTA_IIF]);
2525 oif = nla_get_u32(tb[RTA_OIF]);
2528 struct net_device *dev;
2531 dev = __dev_get_by_index(net, iif);
2537 fl6.flowi6_iif = iif;
2539 if (!ipv6_addr_any(&fl6.saddr))
2540 flags |= RT6_LOOKUP_F_HAS_SADDR;
2542 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2545 fl6.flowi6_oif = oif;
2547 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2550 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2552 dst_release(&rt->dst);
2557 /* Reserve room for dummy headers, this skb can pass
2558 through good chunk of routing engine.
2560 skb_reset_mac_header(skb);
2561 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2563 skb_dst_set(skb, &rt->dst);
2565 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2566 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2567 nlh->nlmsg_seq, 0, 0, 0);
2573 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/*
 * inet6_rt_notify - broadcast a route change to netlink listeners.
 * @event: netlink message type to send
 * @rt:    route the event is about
 * @info:  netlink context (namespace, pid, optional request header)
 *
 * The sequence number is taken from the request header when one exists,
 * else 0.  A buffer of rt6_nlmsg_size() bytes is allocated with the
 * gfp_any() mask, filled by rt6_fill_node() and sent to the
 * RTNLGRP_IPV6_ROUTE group.  rtnl_set_sk_err() records an error for the
 * group on the failure path.
 * NOTE(review): the NULL/negative checks and the jumps that connect the
 * success and failure paths are on lines missing from this listing.
 */
2578 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2580 struct sk_buff *skb;
2581 struct net *net = info->nl_net;
2586 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2588 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2592 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2593 event, info->pid, seq, 0, 0, 0);
2595 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2596 WARN_ON(err == -EMSGSIZE);
2600 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2601 info->nlh, gfp_any());
2605 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * ip6_route_dev_notify - netdevice notifier for the special route entries.
 * @this:  notifier block (not referenced on the lines visible here)
 * @event: netdevice event code
 * @data:  the struct net_device the event is about
 *
 * When a loopback device is registered, the null entry of its namespace
 * is pointed at that device and given a referenced inet6_dev.  With
 * CONFIG_IPV6_MULTIPLE_TABLES the prohibit and blackhole entries get the
 * same treatment, each with its own in6_dev_get() reference.
 * NOTE(review): the return value is on a line missing from this listing.
 */
2608 static int ip6_route_dev_notify(struct notifier_block *this,
2609 unsigned long event, void *data)
2611 struct net_device *dev = (struct net_device *)data;
2612 struct net *net = dev_net(dev);
2614 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2615 net->ipv6.ip6_null_entry->dst.dev = dev;
2616 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2617 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2618 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2619 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2620 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2621 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2632 #ifdef CONFIG_PROC_FS
/*
 * rt6_info_route - tree-walk callback printing one route as a text line.
 * @rt:    route to print
 * @p_arg: the seq_file to print into
 *
 * Fields in order: destination address and prefix length; source address
 * and prefix length (a zero placeholder is printed on the other
 * preprocessor branch); gateway taken from the bound neighbour, with a
 * zero placeholder as alternative; then metric, reference count, use
 * count, flags and device name (empty when the route has no device).
 * NOTE(review): the neighbour NULL test, any locking around the
 * neighbour access, and the return value are on lines missing from this
 * listing.
 */
2643 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2645 struct seq_file *m = p_arg;
2646 struct neighbour *n;
2648 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2650 #ifdef CONFIG_IPV6_SUBTREES
2651 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2653 seq_puts(m, "00000000000000000000000000000000 00 ");
2656 n = dst_get_neighbour_noref(&rt->dst);
2658 seq_printf(m, "%pi6", n->primary_key);
2660 seq_puts(m, "00000000000000000000000000000000");
2663 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2664 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2665 rt->dst.__use, rt->rt6i_flags,
2666 rt->dst.dev ? rt->dst.dev->name : "");
/*
 * ipv6_route_show - seq_file show routine for the route listing.
 * Reads the namespace from m->private and prints every route through
 * rt6_info_route() using the read-only tree walker.
 */
2670 static int ipv6_route_show(struct seq_file *m, void *v)
2672 struct net *net = (struct net *)m->private;
2673 fib6_clean_all_ro(net, rt6_info_route, 0, m);
/*
 * ipv6_route_open - open handler binding ipv6_route_show() to the file
 * through single_open_net().
 */
2677 static int ipv6_route_open(struct inode *inode, struct file *file)
2679 return single_open_net(inode, file, ipv6_route_show);
/*
 * File operations for the ipv6_route proc entry created in
 * ip6_route_net_init().
 * NOTE(review): one member initialiser is on a line missing from this
 * listing.
 */
2682 static const struct file_operations ipv6_route_proc_fops = {
2683 .owner = THIS_MODULE,
2684 .open = ipv6_route_open,
2686 .llseek = seq_lseek,
2687 .release = single_release_net,
/*
 * rt6_stats_seq_show - seq_file show routine for the routing statistics.
 * Prints seven hexadecimal fields on one line: fib_nodes,
 * fib_route_nodes, fib_rt_alloc, fib_rt_entries, fib_rt_cache, the
 * current dst entry count of the namespace, and fib_discarded_routes.
 */
2690 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2692 struct net *net = (struct net *)seq->private;
2693 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2694 net->ipv6.rt6_stats->fib_nodes,
2695 net->ipv6.rt6_stats->fib_route_nodes,
2696 net->ipv6.rt6_stats->fib_rt_alloc,
2697 net->ipv6.rt6_stats->fib_rt_entries,
2698 net->ipv6.rt6_stats->fib_rt_cache,
2699 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2700 net->ipv6.rt6_stats->fib_discarded_routes);
/*
 * rt6_stats_seq_open - open handler binding rt6_stats_seq_show() to the
 * file through single_open_net().
 */
2705 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2707 return single_open_net(inode, file, rt6_stats_seq_show);
/*
 * File operations for the rt6_stats proc entry created in
 * ip6_route_net_init().
 * NOTE(review): one member initialiser is on a line missing from this
 * listing.
 */
2710 static const struct file_operations rt6_stats_seq_fops = {
2711 .owner = THIS_MODULE,
2712 .open = rt6_stats_seq_open,
2714 .llseek = seq_lseek,
2715 .release = single_release_net,
2717 #endif /* CONFIG_PROC_FS */
2719 #ifdef CONFIG_SYSCTL
/*
 * ipv6_sysctl_rtcache_flush - handler for the route flush sysctl.
 * @ctl:    table entry; extra1 carries the owning namespace
 * @write:  non-zero for a write access
 * @buffer, @lenp, @ppos: passed straight to proc_dointvec()
 *
 * Runs the route garbage collector with the flush delay as expiry; a
 * delay of zero or less is turned into ~0UL.
 * NOTE(review): the proc_dointvec() call is a bare statement, so its
 * result is not used on the lines visible here.  The test on @write and
 * the return values are on lines missing from this listing.
 */
2722 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2723 void __user *buffer, size_t *lenp, loff_t *ppos)
2730 net = (struct net *)ctl->extra1;
/* Sampled before proc_dointvec() runs: the GC below uses the old value. */
2731 delay = net->ipv6.sysctl.flush_delay;
2732 proc_dointvec(ctl, write, buffer, lenp, ppos);
2733 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
/*
 * Template for the per-namespace route sysctl table.  The data pointers
 * here refer to init_net (or to the dst ops template); the copy made by
 * ipv6_route_sysctl_init() rebinds them BY ARRAY INDEX, so the order of
 * the entries must stay in step with that function.
 * gc_min_interval and gc_min_interval_ms expose the same variable
 * through a jiffies handler and a milliseconds handler respectively.
 * NOTE(review): the mode member of each entry and the terminating entry
 * are on lines missing from this listing.
 */
2737 ctl_table ipv6_route_table_template[] = {
2739 .procname = "flush",
2740 .data = &init_net.ipv6.sysctl.flush_delay,
2741 .maxlen = sizeof(int),
2743 .proc_handler = ipv6_sysctl_rtcache_flush
2746 .procname = "gc_thresh",
2747 .data = &ip6_dst_ops_template.gc_thresh,
2748 .maxlen = sizeof(int),
2750 .proc_handler = proc_dointvec,
2753 .procname = "max_size",
2754 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2755 .maxlen = sizeof(int),
2757 .proc_handler = proc_dointvec,
2760 .procname = "gc_min_interval",
2761 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2762 .maxlen = sizeof(int),
2764 .proc_handler = proc_dointvec_jiffies,
2767 .procname = "gc_timeout",
2768 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2769 .maxlen = sizeof(int),
2771 .proc_handler = proc_dointvec_jiffies,
2774 .procname = "gc_interval",
2775 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2776 .maxlen = sizeof(int),
2778 .proc_handler = proc_dointvec_jiffies,
2781 .procname = "gc_elasticity",
2782 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2783 .maxlen = sizeof(int),
2785 .proc_handler = proc_dointvec,
2788 .procname = "mtu_expires",
2789 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2790 .maxlen = sizeof(int),
2792 .proc_handler = proc_dointvec_jiffies,
2795 .procname = "min_adv_mss",
2796 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2797 .maxlen = sizeof(int),
2799 .proc_handler = proc_dointvec,
2802 .procname = "gc_min_interval_ms",
2803 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2804 .maxlen = sizeof(int),
2806 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * ipv6_route_sysctl_init - build the route sysctl table for a namespace.
 * @net: namespace the table is for
 *
 * Duplicates ipv6_route_table_template and points every data member at
 * the matching per-namespace variable.  Entry 0 (flush) also stores the
 * namespace in extra1, which ipv6_sysctl_rtcache_flush() reads back.
 * NOTE(review): the allocation flags, the NULL check on the copy, and
 * the return are on lines missing from this listing.
 */
2811 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2813 struct ctl_table *table;
2815 table = kmemdup(ipv6_route_table_template,
2816 sizeof(ipv6_route_table_template),
/* Indices follow the entry order of ipv6_route_table_template. */
2820 table[0].data = &net->ipv6.sysctl.flush_delay;
2821 table[0].extra1 = net;
2822 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2823 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2824 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2825 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2826 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2827 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2828 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2829 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2830 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/*
 * ip6_route_net_init - per-namespace setup of the IPv6 routing core.
 * @net: namespace being created
 *
 * Copies the dst ops template and initialises its entry counter, then
 * duplicates the null entry template (and, with
 * CONFIG_IPV6_MULTIPLE_TABLES, the prohibit and blackhole templates).
 * Each duplicate gets dst.path pointing at itself, the namespace dst ops
 * and read-only template metrics.  Default sysctl values are then set
 * (max_size 4096, gc_min_interval HZ/2, gc_timeout 60s, gc_interval 30s,
 * gc_elasticity 9, mtu_expires 10min, min_advmss IPV6_MIN_MTU - 20 - 40),
 * the ipv6_route and rt6_stats proc files are created, and the GC expiry
 * starts at 30s.  The labels at the end free what was allocated, in
 * reverse order of allocation.
 * NOTE(review): the allocation flags, the return statements, and some
 * labels and preprocessor lines are missing from this listing.
 */
2837 static int __net_init ip6_route_net_init(struct net *net)
2841 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2842 sizeof(net->ipv6.ip6_dst_ops));
2844 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2845 goto out_ip6_dst_ops;
2847 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2848 sizeof(*net->ipv6.ip6_null_entry),
2850 if (!net->ipv6.ip6_null_entry)
2851 goto out_ip6_dst_entries;
2852 net->ipv6.ip6_null_entry->dst.path =
2853 (struct dst_entry *)net->ipv6.ip6_null_entry;
2854 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2855 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2856 ip6_template_metrics, true);
2858 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2859 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2860 sizeof(*net->ipv6.ip6_prohibit_entry),
2862 if (!net->ipv6.ip6_prohibit_entry)
2863 goto out_ip6_null_entry;
2864 net->ipv6.ip6_prohibit_entry->dst.path =
2865 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2866 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2867 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2868 ip6_template_metrics, true);
2870 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2871 sizeof(*net->ipv6.ip6_blk_hole_entry),
2873 if (!net->ipv6.ip6_blk_hole_entry)
2874 goto out_ip6_prohibit_entry;
2875 net->ipv6.ip6_blk_hole_entry->dst.path =
2876 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2877 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2878 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2879 ip6_template_metrics, true);
2882 net->ipv6.sysctl.flush_delay = 0;
2883 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2884 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2885 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2886 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2887 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2888 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2889 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2891 #ifdef CONFIG_PROC_FS
2892 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2893 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2895 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2901 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2902 out_ip6_prohibit_entry:
2903 kfree(net->ipv6.ip6_prohibit_entry);
2905 kfree(net->ipv6.ip6_null_entry);
2907 out_ip6_dst_entries:
2908 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * ip6_route_net_exit - per-namespace teardown of the IPv6 routing core.
 * @net: namespace being destroyed
 *
 * Removes the two proc files, frees the special route entries created by
 * ip6_route_net_init() and destroys the dst entry counter.
 */
2913 static void __net_exit ip6_route_net_exit(struct net *net)
2915 #ifdef CONFIG_PROC_FS
2916 proc_net_remove(net, "ipv6_route");
2917 proc_net_remove(net, "rt6_stats");
2919 kfree(net->ipv6.ip6_null_entry);
2920 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2921 kfree(net->ipv6.ip6_prohibit_entry);
2922 kfree(net->ipv6.ip6_blk_hole_entry);
2924 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * Per-namespace hooks for the routing core; registered in
 * ip6_route_init() and unregistered in ip6_route_cleanup().
 */
2927 static struct pernet_operations ip6_route_net_ops = {
2928 .init = ip6_route_net_init,
2929 .exit = ip6_route_net_exit,
/*
 * ipv6_inetpeer_init - allocate the IPv6 inet_peer base of a namespace.
 * @net: namespace being created
 *
 * The base is allocated with GFP_KERNEL, initialised with
 * inet_peer_base_init() and stored in net->ipv6.peers.
 * NOTE(review): the allocation failure check and the return values are
 * on lines missing from this listing.
 */
2932 static int __net_init ipv6_inetpeer_init(struct net *net)
2934 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2938 inet_peer_base_init(bp);
2939 net->ipv6.peers = bp;
/*
 * ipv6_inetpeer_exit - tear down the IPv6 inet_peer base of a namespace.
 * @net: namespace being destroyed
 *
 * The pointer in the namespace is cleared before the tree is
 * invalidated.
 * NOTE(review): the release of the base itself is not on the lines
 * visible in this listing - confirm it is freed.
 */
2943 static void __net_exit ipv6_inetpeer_exit(struct net *net)
2945 struct inet_peer_base *bp = net->ipv6.peers;
2947 net->ipv6.peers = NULL;
2948 inetpeer_invalidate_tree(bp);
/*
 * Per-namespace hooks for the inet_peer base; registered in
 * ip6_route_init() and unregistered in ip6_route_cleanup().
 */
2952 static struct pernet_operations ipv6_inetpeer_ops = {
2953 .init = ipv6_inetpeer_init,
2954 .exit = ipv6_inetpeer_exit,
/*
 * Netdevice notifier block that routes device events to
 * ip6_route_dev_notify().
 * NOTE(review): any further member initialiser is on a line missing from
 * this listing.
 */
2957 static struct notifier_block ip6_route_dev_notifier = {
2958 .notifier_call = ip6_route_dev_notify,
/*
 * ip6_route_init - one-time initialisation of the IPv6 routing core.
 *
 * Order of the steps visible here: create the ip6_dst_cache slab;
 * initialise the blackhole dst entry counter; register the inet_peer
 * and routing per-namespace operations; let the blackhole ops share the
 * slab cache; wire the init_net special entries to the loopback device
 * by hand (see the comment in the body); initialise the FIB rules;
 * register the RTM_NEWROUTE, RTM_DELROUTE and RTM_GETROUTE handlers;
 * register the netdevice notifier.  The labels at the end undo the
 * completed steps in reverse order.
 * NOTE(review): several calls, their result checks, some labels and the
 * return statements are on lines missing from this listing.
 */
2962 int __init ip6_route_init(void)
2967 ip6_dst_ops_template.kmem_cachep =
2968 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2969 SLAB_HWCACHE_ALIGN, NULL);
2970 if (!ip6_dst_ops_template.kmem_cachep)
2973 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2975 goto out_kmem_cache;
2977 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
2979 goto out_dst_entries;
2981 ret = register_pernet_subsys(&ip6_route_net_ops);
2983 goto out_register_inetpeer;
2985 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2987 /* Registering of the loopback is done before this portion of code,
2988 * the loopback reference in rt6_info will not be taken, do it
2989 * manually for init_net */
2990 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2991 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2992 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2993 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2994 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2995 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2996 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3000 goto out_register_subsys;
3006 ret = fib6_rules_init();
3011 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3012 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3013 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3014 goto fib6_rules_init;
3016 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3018 goto fib6_rules_init;
3024 fib6_rules_cleanup();
3029 out_register_subsys:
3030 unregister_pernet_subsys(&ip6_route_net_ops);
3031 out_register_inetpeer:
3032 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3034 dst_entries_destroy(&ip6_dst_blackhole_ops);
3036 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/*
 * Tear down what ip6_route_init() set up: unregister the netdevice
 * notifier, clean up the fib6 rules, unregister both per-namespace ops,
 * destroy the blackhole dst entries and finally destroy the
 * "ip6_dst_cache" slab cache.
 *
 * NOTE(review): ip6_route_init() registers ipv6_inetpeer_ops before
 * ip6_route_net_ops and its error path unregisters them in reverse
 * order, whereas this function unregisters ipv6_inetpeer_ops first.
 * Confirm that ordering is intentional.
 */
3040 void ip6_route_cleanup(void)
3042 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3043 fib6_rules_cleanup();
3046 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3047 unregister_pernet_subsys(&ip6_route_net_ops);
3048 dst_entries_destroy(&ip6_dst_blackhole_ops);
/* The slab cache goes last, mirroring its creation first in ip6_route_init(). */
3049 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);