2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
61 #include <asm/uaccess.h>
64 #include <linux/sysctl.h>
/*
 * Forward declarations for the dst_ops callbacks and the RFC 4191
 * route-information helpers defined later in this file.
 * NOTE(review): this extract is missing lines (numbering gaps); kept
 * byte-identical.
 */
67 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
68 const struct in6_addr *dest);
69 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
70 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
71 static unsigned int ip6_mtu(const struct dst_entry *dst);
72 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
73 static void ip6_dst_destroy(struct dst_entry *);
74 static void ip6_dst_ifdown(struct dst_entry *,
75 struct net_device *dev, int how);
76 static int ip6_dst_gc(struct dst_ops *ops);
78 static int ip6_pkt_discard(struct sk_buff *skb);
79 static int ip6_pkt_discard_out(struct sk_buff *skb);
80 static void ip6_link_failure(struct sk_buff *skb);
81 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
83 #ifdef CONFIG_IPV6_ROUTE_INFO
84 static struct rt6_info *rt6_add_route_info(struct net *net,
85 const struct in6_addr *prefix, int prefixlen,
86 const struct in6_addr *gwaddr, int ifindex,
88 static struct rt6_info *rt6_get_route_info(struct net *net,
89 const struct in6_addr *prefix, int prefixlen,
90 const struct in6_addr *gwaddr, int ifindex);
/*
 * ipv6_cow_metrics - dst_ops->cow_metrics: copy dst metrics out of the
 * shared read-only template into per-peer writable storage, installed
 * with cmpxchg() so concurrent writers race safely.
 * NOTE(review): extract is missing lines (braces, returns, the 'p'
 * declaration); code kept byte-identical — do not trust it to compile.
 */
93 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
95 struct rt6_info *rt = (struct rt6_info *) dst;
96 struct inet_peer *peer;
99 if (!(rt->dst.flags & DST_HOST))
102 peer = rt6_get_peer_create(rt);
104 u32 *old_p = __DST_METRICS_PTR(old);
105 unsigned long prev, new;
108 if (inet_metrics_new(peer))
109 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
111 new = (unsigned long) p;
112 prev = cmpxchg(&dst->_metrics, old, new);
115 p = __DST_METRICS_PTR(prev);
116 if (prev & DST_METRICS_READ_ONLY)
/*
 * choose_neigh_daddr - pick the neighbour key: the route's gateway when
 * set, otherwise (presumably) the packet's destination address.
 * ip6_neigh_lookup - dst_ops->neigh_lookup: find or create the ND-table
 * neighbour entry for that key on dst->dev.
 * NOTE(review): extract is missing lines; kept byte-identical.
 */
123 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
127 struct in6_addr *p = &rt->rt6i_gateway;
129 if (!ipv6_addr_any(p))
130 return (const void *) p;
132 return &ipv6_hdr(skb)->daddr;
136 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
140 struct rt6_info *rt = (struct rt6_info *) dst;
143 daddr = choose_neigh_daddr(rt, skb, daddr);
144 n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
147 return neigh_create(&nd_tbl, daddr, dst->dev);
/*
 * rt6_bind_neighbour - look up (or create) the neighbour entry for
 * rt->rt6i_gateway on @dev; the missing lines presumably store it in
 * rt->n and return an error code — TODO confirm against full source.
 */
150 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
152 struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
154 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
/*
 * ip6_dst_ops_template - dst_ops vtable for regular IPv6 routes; copied
 * per-netns (see references to net->ipv6.ip6_dst_ops elsewhere in file).
 * NOTE(review): some initializer lines are missing in this extract.
 */
163 static struct dst_ops ip6_dst_ops_template = {
165 .protocol = cpu_to_be16(ETH_P_IPV6),
168 .check = ip6_dst_check,
169 .default_advmss = ip6_default_advmss,
171 .cow_metrics = ipv6_cow_metrics,
172 .destroy = ip6_dst_destroy,
173 .ifdown = ip6_dst_ifdown,
174 .negative_advice = ip6_negative_advice,
175 .link_failure = ip6_link_failure,
176 .update_pmtu = ip6_rt_update_pmtu,
177 .local_out = __ip6_local_out,
178 .neigh_lookup = ip6_neigh_lookup,
/*
 * Blackhole dst_ops: used for routes that must swallow packets (e.g.
 * after xfrm conversion). ip6_blackhole_mtu returns the raw metric MTU
 * or falls back to the device MTU; the pmtu/cow callbacks are no-ops
 * (their empty bodies are lost in this extract — TODO confirm).
 */
181 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
183 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
185 return mtu ? : dst->dev->mtu;
188 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
192 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
198 static struct dst_ops ip6_dst_blackhole_ops = {
200 .protocol = cpu_to_be16(ETH_P_IPV6),
201 .destroy = ip6_dst_destroy,
202 .check = ip6_dst_check,
203 .mtu = ip6_blackhole_mtu,
204 .default_advmss = ip6_default_advmss,
205 .update_pmtu = ip6_rt_blackhole_update_pmtu,
206 .cow_metrics = ip6_rt_blackhole_cow_metrics,
207 .neigh_lookup = ip6_neigh_lookup,
/*
 * ip6_null_entry_template - the "no route" sentinel: rejects all
 * packets with -ENETUNREACH, lowest possible priority (~0 metric).
 * ip6_template_metrics pins the hop limit metric at 255.
 * NOTE(review): extract is missing initializer lines; kept byte-identical.
 */
210 static const u32 ip6_template_metrics[RTAX_MAX] = {
211 [RTAX_HOPLIMIT - 1] = 255,
214 static struct rt6_info ip6_null_entry_template = {
216 .__refcnt = ATOMIC_INIT(1),
219 .error = -ENETUNREACH,
220 .input = ip6_pkt_discard,
221 .output = ip6_pkt_discard_out,
223 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
224 .rt6i_protocol = RTPROT_KERNEL,
225 .rt6i_metric = ~(u32) 0,
226 .rt6i_ref = ATOMIC_INIT(1),
/*
 * Policy-routing sentinels (CONFIG_IPV6_MULTIPLE_TABLES only):
 * - prohibit: reject with an ICMP admin-prohibited action (via
 *   ip6_pkt_prohibit*, defined later);
 * - blackhole: silently discard in both directions.
 * Both mirror ip6_null_entry_template's flags/metric/refcounts.
 * NOTE(review): extract is missing lines; kept byte-identical.
 */
229 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
231 static int ip6_pkt_prohibit(struct sk_buff *skb);
232 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
234 static struct rt6_info ip6_prohibit_entry_template = {
236 .__refcnt = ATOMIC_INIT(1),
240 .input = ip6_pkt_prohibit,
241 .output = ip6_pkt_prohibit_out,
243 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
244 .rt6i_protocol = RTPROT_KERNEL,
245 .rt6i_metric = ~(u32) 0,
246 .rt6i_ref = ATOMIC_INIT(1),
249 static struct rt6_info ip6_blk_hole_entry_template = {
251 .__refcnt = ATOMIC_INIT(1),
255 .input = dst_discard,
256 .output = dst_discard,
258 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
259 .rt6i_protocol = RTPROT_KERNEL,
260 .rt6i_metric = ~(u32) 0,
261 .rt6i_ref = ATOMIC_INIT(1),
266 /* allocate dst with ip6_dst_ops */
/*
 * ip6_dst_alloc - allocate an rt6_info from the per-netns dst_ops pool,
 * zero the rt6-specific tail past the embedded dst_entry, and bind the
 * inetpeer base (per-table if @table given, else the netns-wide one).
 * NOTE(review): extract is missing lines (flags argument, NULL check,
 * return); kept byte-identical.
 */
267 static inline struct rt6_info *ip6_dst_alloc(struct net *net,
268 struct net_device *dev,
270 struct fib6_table *table)
272 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
277 sizeof(*rt) - sizeof(struct dst_entry));
278 rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
/*
 * ip6_dst_destroy - dst_ops->destroy: drop everything the rt6_info
 * holds — the neighbour ref, non-template metrics, the inet6_dev ref,
 * the ->from parent dst (for non-RTF_EXPIRES clones) and the inetpeer.
 * NOTE(review): extract is missing lines (idev put, peer put); kept
 * byte-identical.
 */
283 static void ip6_dst_destroy(struct dst_entry *dst)
285 struct rt6_info *rt = (struct rt6_info *)dst;
286 struct inet6_dev *idev = rt->rt6i_idev;
289 neigh_release(rt->n);
291 if (!(rt->dst.flags & DST_HOST))
292 dst_destroy_metrics_generic(dst);
295 rt->rt6i_idev = NULL;
299 if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
300 dst_release(dst->from);
302 if (rt6_has_peer(rt)) {
303 struct inet_peer *peer = rt6_peer_ptr(rt);
/*
 * Peer-generation counter: routes snapshot rt6_peer_genid() so
 * ip6_dst_check() can tell when the global peer state was invalidated.
 * rt6_bind_peer - resolve the inetpeer for rt's destination and attach
 * it (rt6_set_peer is a cmpxchg-style setter; on loss the winner's peer
 * is kept — the losing branch is missing from this extract).
 */
308 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
310 static u32 rt6_peer_genid(void)
312 return atomic_read(&__rt6_peer_genid);
315 void rt6_bind_peer(struct rt6_info *rt, int create)
317 struct inet_peer_base *base;
318 struct inet_peer *peer;
320 base = inetpeer_base_ptr(rt->_rt6i_peer);
324 peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
326 if (!rt6_set_peer(rt, peer))
329 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * ip6_dst_ifdown - dst_ops->ifdown: when @dev goes away, repoint the
 * route's inet6_dev and neighbour device at the netns loopback device
 * so the dst stays valid until its refs drain.
 * NOTE(review): extract is missing lines (in6_dev_put of the old idev);
 * kept byte-identical.
 */
333 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
336 struct rt6_info *rt = (struct rt6_info *)dst;
337 struct inet6_dev *idev = rt->rt6i_idev;
338 struct net_device *loopback_dev =
339 dev_net(dev)->loopback_dev;
341 if (dev != loopback_dev) {
342 if (idev && idev->dev == dev) {
343 struct inet6_dev *loopback_idev =
344 in6_dev_get(loopback_dev);
346 rt->rt6i_idev = loopback_idev;
350 if (rt->n && rt->n->dev == dev) {
351 rt->n->dev = loopback_dev;
352 dev_hold(loopback_dev);
/*
 * rt6_check_expired - true when the route's own RTF_EXPIRES deadline
 * passed, or (for a clone) when its ->from parent expired.
 * rt6_need_strict - destinations whose scope forces strict outgoing-
 * interface matching (multicast / link-local / loopback).
 * NOTE(review): extract is missing lines; kept byte-identical.
 */
358 static bool rt6_check_expired(const struct rt6_info *rt)
360 struct rt6_info *ort = NULL;
362 if (rt->rt6i_flags & RTF_EXPIRES) {
363 if (time_after(jiffies, rt->dst.expires))
365 } else if (rt->dst.from) {
366 ort = (struct rt6_info *) rt->dst.from;
367 return (ort->rt6i_flags & RTF_EXPIRES) &&
368 time_after(jiffies, ort->dst.expires);
373 static bool rt6_need_strict(const struct in6_addr *daddr)
375 return ipv6_addr_type(daddr) &
376 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
* Route lookup. Any table->tb6_lock is implied.
/*
 * rt6_device_match - walk the rt6_next sibling list and pick the route
 * whose device matches @oif (loopback routes match via their idev's
 * ifindex); falls back to a saved local route, or ip6_null_entry when
 * RT6_LOOKUP_F_IFACE demands a device match that no entry satisfies.
 * NOTE(review): extract is missing lines (returns, 'continue's, the
 * tail fallback); kept byte-identical.
 */
383 static inline struct rt6_info *rt6_device_match(struct net *net,
385 const struct in6_addr *saddr,
389 struct rt6_info *local = NULL;
390 struct rt6_info *sprt;
392 if (!oif && ipv6_addr_any(saddr))
395 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
396 struct net_device *dev = sprt->dst.dev;
399 if (dev->ifindex == oif)
401 if (dev->flags & IFF_LOOPBACK) {
402 if (!sprt->rt6i_idev ||
403 sprt->rt6i_idev->dev->ifindex != oif) {
404 if (flags & RT6_LOOKUP_F_IFACE && oif)
406 if (local && (!oif ||
407 local->rt6i_idev->dev->ifindex == oif))
413 if (ipv6_chk_addr(net, saddr, dev,
414 flags & RT6_LOOKUP_F_IFACE))
423 if (flags & RT6_LOOKUP_F_IFACE)
424 return net->ipv6.ip6_null_entry;
430 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * rt6_probe - router reachability probing: if the route's neighbour is
 * not NUD_VALID and the per-idev probe interval elapsed, send a rate-
 * limited unicast NS to the router (RFC 4191 router selection support).
 * The non-CONFIG variant below is an empty stub.
 * NOTE(review): extract is missing lines (returns, stub body, #else/
 * #endif); kept byte-identical.
 */
431 static void rt6_probe(struct rt6_info *rt)
433 struct neighbour *neigh;
435 * Okay, this does not seem to be appropriate
436 * for now, however, we need to check if it
437 * is really so; aka Router Reachability Probing.
439 * Router Reachability Probe MUST be rate-limited
440 * to no more than one per minute.
443 neigh = rt ? rt->n : NULL;
444 if (!neigh || (neigh->nud_state & NUD_VALID))
446 read_lock_bh(&neigh->lock);
447 if (!(neigh->nud_state & NUD_VALID) &&
448 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
449 struct in6_addr mcaddr;
450 struct in6_addr *target;
452 neigh->updated = jiffies;
453 read_unlock_bh(&neigh->lock);
455 target = (struct in6_addr *)&neigh->primary_key;
456 addrconf_addr_solict_mult(target, &mcaddr);
457 ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
459 read_unlock_bh(&neigh->lock);
465 static inline void rt6_probe(struct rt6_info *rt)
* Default Router Selection (RFC 2461 6.3.6)
/*
 * rt6_check_dev - score the route's device against @oif: non-zero when
 * no oif is requested or the device (or its loopback idev) matches.
 * NOTE(review): extract is missing the actual return values; kept
 * byte-identical.
 */
473 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
475 struct net_device *dev = rt->dst.dev;
476 if (!oif || dev->ifindex == oif)
478 if ((dev->flags & IFF_LOOPBACK) &&
479 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/*
 * rt6_check_neigh - reachability score for the route's next hop:
 * non-gateway / NONEXTHOP routes need no neighbour; otherwise inspect
 * the neighbour NUD state under its read lock (NUD_VALID good,
 * NUD_FAILED bad when router preference is enabled).
 * NOTE(review): extract is missing lines (neigh assignment, scores);
 * kept byte-identical.
 */
484 static inline int rt6_check_neigh(struct rt6_info *rt)
486 struct neighbour *neigh;
491 if (rt->rt6i_flags & RTF_NONEXTHOP ||
492 !(rt->rt6i_flags & RTF_GATEWAY))
495 read_lock_bh(&neigh->lock);
496 if (neigh->nud_state & NUD_VALID)
498 #ifdef CONFIG_IPV6_ROUTER_PREF
499 else if (neigh->nud_state & NUD_FAILED)
504 read_unlock_bh(&neigh->lock);
/*
 * rt6_score_route - combine the device match, the RFC 4191 router
 * preference bits and the neighbour reachability into one comparable
 * score; returns a rejection sentinel when a strict requirement
 * (IFACE / REACHABLE) is not met — exact values lost in this extract.
 */
511 static int rt6_score_route(struct rt6_info *rt, int oif,
516 m = rt6_check_dev(rt, oif);
517 if (!m && (strict & RT6_LOOKUP_F_IFACE))
519 #ifdef CONFIG_IPV6_ROUTER_PREF
520 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
522 n = rt6_check_neigh(rt);
523 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
/*
 * find_match - candidate filter for rt6_select(): skip expired routes,
 * score the rest, and keep the best-scoring one in (*mpri, match).
 * NOTE(review): extract is missing most of the comparison logic; kept
 * byte-identical.
 */
528 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
529 int *mpri, struct rt6_info *match)
533 if (rt6_check_expired(rt))
536 m = rt6_score_route(rt, oif, strict);
541 if (strict & RT6_LOOKUP_F_REACHABLE)
545 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
/*
 * find_rr_leaf - round-robin scan: score all same-metric siblings
 * starting at @rr_head, then wrap around from fn->leaf up to rr_head,
 * so successive calls rotate through equal-cost routers.
 * NOTE(review): extract is missing lines (mpri init, return); kept
 * byte-identical.
 */
553 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
554 struct rt6_info *rr_head,
555 u32 metric, int oif, int strict)
557 struct rt6_info *rt, *match;
561 for (rt = rr_head; rt && rt->rt6i_metric == metric;
562 rt = rt->dst.rt6_next)
563 match = find_match(rt, oif, strict, &mpri, match);
564 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
565 rt = rt->dst.rt6_next)
566 match = find_match(rt, oif, strict, &mpri, match);
/*
 * rt6_select - default router selection for a fib6 node: sticky choice
 * of the current rr_ptr while it stays usable; on a REACHABLE miss,
 * advance fn->rr_ptr round-robin to the next same-metric sibling.
 * Falls back to ip6_null_entry when nothing matched.
 * NOTE(review): extract is missing lines (rr_ptr advance); kept
 * byte-identical.
 */
571 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
573 struct rt6_info *match, *rt0;
578 fn->rr_ptr = rt0 = fn->leaf;
580 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
583 (strict & RT6_LOOKUP_F_REACHABLE)) {
584 struct rt6_info *next = rt0->dst.rt6_next;
586 /* no entries matched; do round-robin */
587 if (!next || next->rt6i_metric != rt0->rt6i_metric)
594 net = dev_net(rt0->dst.dev);
595 return match ? match : net->ipv6.ip6_null_entry;
598 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process an RFC 4191 Route Information option from a
 * Router Advertisement: validate length/prefix_len consistency, decode
 * preference and lifetime, then add/refresh (or delete, on zero
 * lifetime) the corresponding RTF_ROUTEINFO route.
 * NOTE(review): extract is missing many lines (error returns, the
 * pref-invalid fallback, rt NULL checks); kept byte-identical.
 */
599 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
600 const struct in6_addr *gwaddr)
602 struct net *net = dev_net(dev);
603 struct route_info *rinfo = (struct route_info *) opt;
604 struct in6_addr prefix_buf, *prefix;
606 unsigned long lifetime;
609 if (len < sizeof(struct route_info)) {
613 /* Sanity check for prefix_len and length */
614 if (rinfo->length > 3) {
616 } else if (rinfo->prefix_len > 128) {
618 } else if (rinfo->prefix_len > 64) {
619 if (rinfo->length < 2) {
622 } else if (rinfo->prefix_len > 0) {
623 if (rinfo->length < 1) {
628 pref = rinfo->route_pref;
629 if (pref == ICMPV6_ROUTER_PREF_INVALID)
632 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
634 if (rinfo->length == 3)
635 prefix = (struct in6_addr *)rinfo->prefix;
637 /* this function is safe */
638 ipv6_addr_prefix(&prefix_buf,
639 (struct in6_addr *)rinfo->prefix,
641 prefix = &prefix_buf;
644 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
647 if (rt && !lifetime) {
653 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
656 rt->rt6i_flags = RTF_ROUTEINFO |
657 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
660 if (!addrconf_finite_timeout(lifetime))
661 rt6_clean_expires(rt);
663 rt6_set_expires(rt, jiffies + HZ * lifetime);
665 dst_release(&rt->dst);
/*
 * BACKTRACK() - lookup-retry macro shared by the ip6_pol_route*
 * functions: when the match is ip6_null_entry, walk up parent fib6
 * nodes (descending into source-address subtrees where present) until
 * a node carrying route info is found or the tree root is hit.
 * NOTE(review): extract is missing lines (the goto labels, loop
 * structure, line continuations beyond those shown); kept byte-identical.
 */
671 #define BACKTRACK(__net, saddr) \
673 if (rt == __net->ipv6.ip6_null_entry) { \
674 struct fib6_node *pn; \
676 if (fn->fn_flags & RTN_TL_ROOT) \
679 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
680 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
683 if (fn->fn_flags & RTN_RTINFO) \
/*
 * ip6_pol_route_lookup - simple (no clone/insert) per-table lookup:
 * fib6_lookup under tb6_lock, device match, BACKTRACK retry, then bump
 * the dst use count. ip6_route_lookup is the fib-rules entry wrapper.
 * NOTE(review): extract is missing lines (retry label, return); kept
 * byte-identical.
 */
689 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
690 struct fib6_table *table,
691 struct flowi6 *fl6, int flags)
693 struct fib6_node *fn;
696 read_lock_bh(&table->tb6_lock);
697 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
700 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
701 BACKTRACK(net, &fl6->saddr);
703 dst_use(&rt->dst, jiffies);
704 read_unlock_bh(&table->tb6_lock);
709 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
712 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
714 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/*
 * rt6_lookup - exported convenience lookup by (daddr, saddr, oif):
 * builds a flowi6, sets IFACE strictness from @strict and HAS_SADDR
 * when a source is supplied, then defers to fib6_rule_lookup().
 * NOTE(review): extract is missing lines (flowi6 initializers, the
 * error-pointer check); kept byte-identical.
 */
716 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
717 const struct in6_addr *saddr, int oif, int strict)
719 struct flowi6 fl6 = {
723 struct dst_entry *dst;
724 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
727 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
728 flags |= RT6_LOOKUP_F_HAS_SADDR;
731 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
733 return (struct rt6_info *) dst;
740 EXPORT_SYMBOL(rt6_lookup);
742 /* ip6_ins_rt is called with FREE table->tb6_lock.
743 It takes new route entry, the addition fails by any reason the
744 route is freed. In any case, if caller does not hold it, it may
/*
 * __ip6_ins_rt - insert @rt into its fib6 table under the table write
 * lock; ip6_ins_rt wraps it with a default nl_info for kernel-internal
 * insertions. On fib6_add() failure the route is freed (per the
 * comment above) — the error path lines are missing from this extract.
 */
748 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
751 struct fib6_table *table;
753 table = rt->rt6i_table;
754 write_lock_bh(&table->tb6_lock);
755 err = fib6_add(&table->tb6_root, rt, info);
756 write_unlock_bh(&table->tb6_lock);
761 int ip6_ins_rt(struct rt6_info *rt)
763 struct nl_info info = {
764 .nl_net = dev_net(rt->dst.dev),
766 return __ip6_ins_rt(rt, &info);
/*
 * rt6_alloc_cow - clone @ort into a host (/128) RTF_CACHE route for
 * (daddr, saddr) and bind its neighbour entry. If neighbour creation
 * fails (table overflow), temporarily force aggressive GC (elasticity 1,
 * min_interval 0), run ip6_dst_gc(), restore the sysctls and retry a
 * bounded number of times before warning and giving up.
 * NOTE(review): extract is missing many lines (the retry goto, the
 * subtree #endif, failure cleanup, return); kept byte-identical.
 */
769 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
770 const struct in6_addr *daddr,
771 const struct in6_addr *saddr)
779 rt = ip6_rt_copy(ort, daddr);
782 int attempts = !in_softirq();
784 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
785 if (ort->rt6i_dst.plen != 128 &&
786 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
787 rt->rt6i_flags |= RTF_ANYCAST;
788 rt->rt6i_gateway = *daddr;
791 rt->rt6i_flags |= RTF_CACHE;
793 #ifdef CONFIG_IPV6_SUBTREES
794 if (rt->rt6i_src.plen && saddr) {
795 rt->rt6i_src.addr = *saddr;
796 rt->rt6i_src.plen = 128;
801 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
802 struct net *net = dev_net(rt->dst.dev);
803 int saved_rt_min_interval =
804 net->ipv6.sysctl.ip6_rt_gc_min_interval;
805 int saved_rt_elasticity =
806 net->ipv6.sysctl.ip6_rt_gc_elasticity;
808 if (attempts-- > 0) {
809 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
810 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
812 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
814 net->ipv6.sysctl.ip6_rt_gc_elasticity =
816 net->ipv6.sysctl.ip6_rt_gc_min_interval =
817 saved_rt_min_interval;
821 net_warn_ratelimited("Neighbour table overflow\n");
/*
 * rt6_alloc_clone - lighter-weight clone than rt6_alloc_cow: copy @ort
 * as an RTF_CACHE entry and share its neighbour via neigh_clone()
 * instead of binding a new one (used for non-gateway non-host routes).
 * NOTE(review): NULL-check and return lines are missing; kept
 * byte-identical.
 */
830 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
831 const struct in6_addr *daddr)
833 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
836 rt->rt6i_flags |= RTF_CACHE;
837 rt->n = neigh_clone(ort->n);
/*
 * ip6_pol_route - main policy-routing lookup: select the best route
 * under tb6_lock (REACHABLE strictness only when forwarding is off);
 * for non-cached results, drop the lock and make an RTF_CACHE clone
 * (cow for next-hop resolution, plain clone for non-host routes), then
 * insert it — re-looking up on the insert race noted below.
 * NOTE(review): extract is missing many lines (goto labels, the
 * reachable-retry loop, release paths); kept byte-identical.
 */
842 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
843 struct flowi6 *fl6, int flags)
845 struct fib6_node *fn;
846 struct rt6_info *rt, *nrt;
850 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
852 strict |= flags & RT6_LOOKUP_F_IFACE;
855 read_lock_bh(&table->tb6_lock);
858 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
861 rt = rt6_select(fn, oif, strict | reachable);
863 BACKTRACK(net, &fl6->saddr);
864 if (rt == net->ipv6.ip6_null_entry ||
865 rt->rt6i_flags & RTF_CACHE)
869 read_unlock_bh(&table->tb6_lock);
871 if (!rt->n && !(rt->rt6i_flags & RTF_NONEXTHOP))
872 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
873 else if (!(rt->dst.flags & DST_HOST))
874 nrt = rt6_alloc_clone(rt, &fl6->daddr);
878 dst_release(&rt->dst);
879 rt = nrt ? : net->ipv6.ip6_null_entry;
883 err = ip6_ins_rt(nrt);
892 * Race condition! In the gap, when table->tb6_lock was
893 * released someone could insert this route. Relookup.
895 dst_release(&rt->dst);
904 read_unlock_bh(&table->tb6_lock);
906 rt->dst.lastuse = jiffies;
/*
 * Input-path wrappers: ip6_pol_route_input routes on the incoming
 * interface; ip6_route_input_lookup adds IFACE strictness for
 * scope-restricted destinations (except PIM register devices) before
 * dispatching through the fib rules.
 */
912 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
913 struct flowi6 *fl6, int flags)
915 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
918 static struct dst_entry *ip6_route_input_lookup(struct net *net,
919 struct net_device *dev,
920 struct flowi6 *fl6, int flags)
922 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
923 flags |= RT6_LOOKUP_F_IFACE;
925 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
/*
 * ip6_route_input - receive-path entry point: build a flowi6 from the
 * packet's IPv6 header (addresses, flow label, nexthdr, skb mark) and
 * attach the looked-up dst to the skb.
 * NOTE(review): daddr/saddr initializer lines are missing from this
 * extract; kept byte-identical.
 */
928 void ip6_route_input(struct sk_buff *skb)
930 const struct ipv6hdr *iph = ipv6_hdr(skb);
931 struct net *net = dev_net(skb->dev);
932 int flags = RT6_LOOKUP_F_HAS_SADDR;
933 struct flowi6 fl6 = {
934 .flowi6_iif = skb->dev->ifindex,
937 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
938 .flowi6_mark = skb->mark,
939 .flowi6_proto = iph->nexthdr,
942 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/*
 * Output-path wrappers: ip6_pol_route_output routes on the requested
 * outgoing interface; ip6_route_output (exported) forces the loopback
 * iif, applies IFACE strictness for bound sockets / strict-scope
 * destinations, HAS_SADDR when a source is given, and the socket's
 * source-address preference flags.
 * NOTE(review): extract is missing lines; kept byte-identical.
 */
945 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
946 struct flowi6 *fl6, int flags)
948 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
951 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
956 fl6->flowi6_iif = net->loopback_dev->ifindex;
958 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
959 flags |= RT6_LOOKUP_F_IFACE;
961 if (!ipv6_addr_any(&fl6->saddr))
962 flags |= RT6_LOOKUP_F_HAS_SADDR;
964 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
966 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
969 EXPORT_SYMBOL(ip6_route_output);
/*
 * ip6_blackhole_route - convert @dst_orig into a packet-discarding
 * clone backed by ip6_dst_blackhole_ops: copy the rt6 key/flags/idev
 * and metrics (by reference when the source metrics are read-only),
 * point input/output at dst_discard, release the original, and return
 * the clone or ERR_PTR(-ENOMEM).
 * NOTE(review): extract is missing lines (NULL checks, 'new'
 * assignment, #endif); kept byte-identical.
 */
971 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
973 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
974 struct dst_entry *new = NULL;
976 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
978 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
979 rt6_init_peer(rt, net->ipv6.peers);
984 new->input = dst_discard;
985 new->output = dst_discard;
987 if (dst_metrics_read_only(&ort->dst))
988 new->_metrics = ort->dst._metrics;
990 dst_copy_metrics(new, &ort->dst);
991 rt->rt6i_idev = ort->rt6i_idev;
993 in6_dev_hold(rt->rt6i_idev);
995 rt->rt6i_gateway = ort->rt6i_gateway;
996 rt->rt6i_flags = ort->rt6i_flags;
997 rt6_clean_expires(rt);
1000 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1001 #ifdef CONFIG_IPV6_SUBTREES
1002 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1008 dst_release(dst_orig);
1009 return new ? new : ERR_PTR(-ENOMEM);
1013 * Destination cache support functions
/*
 * ip6_dst_check - dst_ops->check: the cached dst stays valid while its
 * fib6 node's serial number matches @cookie; on a peer-generation
 * change, rebind the inetpeer. The return statements are missing from
 * this extract — presumably dst when valid, NULL otherwise.
 */
1016 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1018 struct rt6_info *rt;
1020 rt = (struct rt6_info *) dst;
1022 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
1023 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1024 if (!rt6_has_peer(rt))
1025 rt6_bind_peer(rt, 0);
1026 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * ip6_negative_advice - dst_ops->negative_advice: drop an expired
 * RTF_CACHE entry (deletion lines missing from this extract).
 * ip6_link_failure - dst_ops->link_failure: report address-unreachable
 * to the sender, expire the cached route immediately, or invalidate
 * the fib6 node serial for default routes so they get re-checked.
 */
1033 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1035 struct rt6_info *rt = (struct rt6_info *) dst;
1038 if (rt->rt6i_flags & RTF_CACHE) {
1039 if (rt6_check_expired(rt)) {
1051 static void ip6_link_failure(struct sk_buff *skb)
1053 struct rt6_info *rt;
1055 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1057 rt = (struct rt6_info *) skb_dst(skb);
1059 if (rt->rt6i_flags & RTF_CACHE)
1060 rt6_update_expires(rt, 0);
1061 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1062 rt->rt6i_node->fn_sernum = -1;
/*
 * ip6_rt_update_pmtu - dst_ops->update_pmtu: on a smaller PMTU for a
 * host (/128) route, mark it RTF_MODIFIED, clamp below IPV6_MIN_MTU by
 * enabling ALLFRAG (per RFC 2460, fragment to 1280), store the new MTU
 * metric and arm the ip6_rt_mtu_expires timer.
 * NOTE(review): the mtu = IPV6_MIN_MTU clamp line appears missing from
 * this extract — confirm against full source.
 */
1066 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1068 struct rt6_info *rt6 = (struct rt6_info*)dst;
1071 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1072 struct net *net = dev_net(dst->dev);
1074 rt6->rt6i_flags |= RTF_MODIFIED;
1075 if (mtu < IPV6_MIN_MTU) {
1076 u32 features = dst_metric(dst, RTAX_FEATURES);
1078 features |= RTAX_FEATURE_ALLFRAG;
1079 dst_metric_set(dst, RTAX_FEATURES, features);
1081 dst_metric_set(dst, RTAX_MTU, mtu);
1082 rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
/*
 * ip6_update_pmtu - handle a Packet Too Big for the flow described by
 * the embedded IPv6 header in @skb: rebuild the flowi6, look up the
 * route and apply the new MTU via ip6_rt_update_pmtu.
 * ip6_sk_update_pmtu - same, keyed by the socket's bound device/mark.
 * NOTE(review): dst error-check/release lines are missing from this
 * extract; kept byte-identical.
 */
1086 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1089 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1090 struct dst_entry *dst;
1093 memset(&fl6, 0, sizeof(fl6));
1094 fl6.flowi6_oif = oif;
1095 fl6.flowi6_mark = mark;
1096 fl6.flowi6_flags = FLOWI_FLAG_PRECOW_METRICS;
1097 fl6.daddr = iph->daddr;
1098 fl6.saddr = iph->saddr;
1099 fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
1101 dst = ip6_route_output(net, NULL, &fl6);
1103 ip6_rt_update_pmtu(dst, ntohl(mtu));
1106 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1108 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1110 ip6_update_pmtu(skb, sock_net(sk), mtu,
1111 sk->sk_bound_dev_if, sk->sk_mark);
1113 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
/*
 * ip6_default_advmss - dst_ops->default_advmss: advertised MSS is the
 * path MTU minus IPv6+TCP headers, floored at the ip6_rt_min_advmss
 * sysctl and capped per the IPV6_MAXPLEN note below.
 * NOTE(review): the capped assignment and return are missing from this
 * extract; kept byte-identical.
 */
1115 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1117 struct net_device *dev = dst->dev;
1118 unsigned int mtu = dst_mtu(dst);
1119 struct net *net = dev_net(dev);
1121 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1123 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1124 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1127 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1128 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1129 * IPV6_MAXPLEN is also valid and means: "any MSS,
1130 * rely only on pmtu discovery"
1132 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/*
 * ip6_mtu - dst_ops->mtu: return the raw MTU metric when set, else the
 * interface's cnf.mtu6 (fallback/return lines missing in this extract).
 */
1137 static unsigned int ip6_mtu(const struct dst_entry *dst)
1139 struct inet6_dev *idev;
1140 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1148 idev = __in6_dev_get(dst->dev);
1150 mtu = idev->cnf.mtu6;
/*
 * ICMPv6 dst entries are kept on a private list (icmp6_dst_gc_list,
 * guarded by icmp6_dst_lock) rather than in the fib, and reaped by
 * icmp6_dst_gc() below.
 *
 * icmp6_dst_alloc - build a standalone host dst for sending an ICMPv6
 * packet to fl6->daddr on @dev: allocate, resolve the neighbour, set
 * DST_HOST/output/hoplimit, chain onto the gc list, kick fib6 GC and
 * run the result through xfrm_lookup.
 * NOTE(review): extract is missing lines (in6_dev_put on error paths,
 * neigh attach, return); kept byte-identical.
 */
1156 static struct dst_entry *icmp6_dst_gc_list;
1157 static DEFINE_SPINLOCK(icmp6_dst_lock);
1159 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1160 struct neighbour *neigh,
1163 struct dst_entry *dst;
1164 struct rt6_info *rt;
1165 struct inet6_dev *idev = in6_dev_get(dev);
1166 struct net *net = dev_net(dev);
1168 if (unlikely(!idev))
1169 return ERR_PTR(-ENODEV);
1171 rt = ip6_dst_alloc(net, dev, 0, NULL);
1172 if (unlikely(!rt)) {
1174 dst = ERR_PTR(-ENOMEM);
1181 neigh = ip6_neigh_lookup(&rt->dst, NULL, &fl6->daddr);
1182 if (IS_ERR(neigh)) {
1185 return ERR_CAST(neigh);
1189 rt->dst.flags |= DST_HOST;
1190 rt->dst.output = ip6_output;
1192 atomic_set(&rt->dst.__refcnt, 1);
1193 rt->rt6i_dst.addr = fl6->daddr;
1194 rt->rt6i_dst.plen = 128;
1195 rt->rt6i_idev = idev;
1196 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1198 spin_lock_bh(&icmp6_dst_lock);
1199 rt->dst.next = icmp6_dst_gc_list;
1200 icmp6_dst_gc_list = &rt->dst;
1201 spin_unlock_bh(&icmp6_dst_lock);
1203 fib6_force_start_gc(net);
1205 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/*
 * icmp6_dst_gc - walk the icmp6 dst list and free entries whose
 * refcount dropped to zero; returns whether any remain (the unlink/
 * free and 'more' accounting lines are missing from this extract).
 * icmp6_clean_all - same walk, but freeing the entries @func selects.
 */
1211 int icmp6_dst_gc(void)
1213 struct dst_entry *dst, **pprev;
1216 spin_lock_bh(&icmp6_dst_lock);
1217 pprev = &icmp6_dst_gc_list;
1219 while ((dst = *pprev) != NULL) {
1220 if (!atomic_read(&dst->__refcnt)) {
1229 spin_unlock_bh(&icmp6_dst_lock);
1234 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1237 struct dst_entry *dst, **pprev;
1239 spin_lock_bh(&icmp6_dst_lock);
1240 pprev = &icmp6_dst_gc_list;
1241 while ((dst = *pprev) != NULL) {
1242 struct rt6_info *rt = (struct rt6_info *) dst;
1243 if (func(rt, arg)) {
1250 spin_unlock_bh(&icmp6_dst_lock);
/*
 * ip6_dst_gc - dst_ops->gc: skip collection while under rt_max_size
 * and within rt_min_interval of the last run; otherwise escalate
 * ip6_rt_gc_expire, run fib6 GC, and decay the expire value once the
 * entry count is back under gc_thresh. Returns non-zero (allocation
 * should fail) while still over rt_max_size.
 * NOTE(review): the early-return line for the rate-limit branch is
 * missing from this extract; kept byte-identical.
 */
1253 static int ip6_dst_gc(struct dst_ops *ops)
1255 unsigned long now = jiffies;
1256 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1257 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1258 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1259 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1260 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1261 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1264 entries = dst_entries_get_fast(ops);
1265 if (time_after(rt_last_gc + rt_min_interval, now) &&
1266 entries <= rt_max_size)
1269 net->ipv6.ip6_rt_gc_expire++;
1270 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1271 net->ipv6.ip6_rt_last_gc = now;
1272 entries = dst_entries_get_slow(ops);
1273 if (entries < ops->gc_thresh)
1274 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1276 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1277 return entries > rt_max_size;
1280 /* Clean host part of a prefix. Not necessary in radix tree,
1281 but results in cleaner routing tables.
1283 Remove it only when all the things will work!
/*
 * ip6_dst_hoplimit - resolve the hop limit for @dst: the explicit
 * RTAX_HOPLIMIT metric when set, else the egress device's per-idev
 * hop_limit, else the netns devconf_all default.
 * NOTE(review): rcu lock/unlock and return lines are missing from this
 * extract; kept byte-identical.
 */
1286 int ip6_dst_hoplimit(struct dst_entry *dst)
1288 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1289 if (hoplimit == 0) {
1290 struct net_device *dev = dst->dev;
1291 struct inet6_dev *idev;
1294 idev = __in6_dev_get(dev);
1296 hoplimit = idev->cnf.hop_limit;
1298 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1303 EXPORT_SYMBOL(ip6_dst_hoplimit);
/*
 * ip6_route_add - build and insert an rt6_info from a netlink/ioctl
 * fib6_config: validate prefix lengths, resolve device/idev and the
 * fib6 table, allocate the route, set expiry/protocol/input/output
 * handlers, handle the loopback-promotes-to-reject case, validate a
 * gateway (strictly link-local unless the egress device is point-to-
 * point-ish — see ANK's comment below), resolve prefsrc, bind the
 * neighbour for gateway routes, apply netlink metrics and insert via
 * __ip6_ins_rt.
 * NOTE(review): this extract is missing many lines (error gotos, the
 * 'out' cleanup path, several checks); kept byte-identical — treat the
 * control flow below as indicative only.
 */
1309 int ip6_route_add(struct fib6_config *cfg)
1312 struct net *net = cfg->fc_nlinfo.nl_net;
1313 struct rt6_info *rt = NULL;
1314 struct net_device *dev = NULL;
1315 struct inet6_dev *idev = NULL;
1316 struct fib6_table *table;
1319 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1321 #ifndef CONFIG_IPV6_SUBTREES
1322 if (cfg->fc_src_len)
1325 if (cfg->fc_ifindex) {
1327 dev = dev_get_by_index(net, cfg->fc_ifindex);
1330 idev = in6_dev_get(dev);
1335 if (cfg->fc_metric == 0)
1336 cfg->fc_metric = IP6_RT_PRIO_USER;
1339 if (cfg->fc_nlinfo.nlh &&
1340 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1341 table = fib6_get_table(net, cfg->fc_table);
1343 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1344 table = fib6_new_table(net, cfg->fc_table);
1347 table = fib6_new_table(net, cfg->fc_table);
1353 rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT, table);
1360 rt->dst.obsolete = -1;
1362 if (cfg->fc_flags & RTF_EXPIRES)
1363 rt6_set_expires(rt, jiffies +
1364 clock_t_to_jiffies(cfg->fc_expires));
1366 rt6_clean_expires(rt);
1368 if (cfg->fc_protocol == RTPROT_UNSPEC)
1369 cfg->fc_protocol = RTPROT_BOOT;
1370 rt->rt6i_protocol = cfg->fc_protocol;
1372 addr_type = ipv6_addr_type(&cfg->fc_dst);
1374 if (addr_type & IPV6_ADDR_MULTICAST)
1375 rt->dst.input = ip6_mc_input;
1376 else if (cfg->fc_flags & RTF_LOCAL)
1377 rt->dst.input = ip6_input;
1379 rt->dst.input = ip6_forward;
1381 rt->dst.output = ip6_output;
1383 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1384 rt->rt6i_dst.plen = cfg->fc_dst_len;
1385 if (rt->rt6i_dst.plen == 128)
1386 rt->dst.flags |= DST_HOST;
1388 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1389 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1394 dst_init_metrics(&rt->dst, metrics, 0);
1396 #ifdef CONFIG_IPV6_SUBTREES
1397 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1398 rt->rt6i_src.plen = cfg->fc_src_len;
1401 rt->rt6i_metric = cfg->fc_metric;
1403 /* We cannot add true routes via loopback here,
1404 they would result in kernel looping; promote them to reject routes
1406 if ((cfg->fc_flags & RTF_REJECT) ||
1407 (dev && (dev->flags & IFF_LOOPBACK) &&
1408 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1409 !(cfg->fc_flags & RTF_LOCAL))) {
1410 /* hold loopback dev/idev if we haven't done so. */
1411 if (dev != net->loopback_dev) {
1416 dev = net->loopback_dev;
1418 idev = in6_dev_get(dev);
1424 rt->dst.output = ip6_pkt_discard_out;
1425 rt->dst.input = ip6_pkt_discard;
1426 rt->dst.error = -ENETUNREACH;
1427 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1431 if (cfg->fc_flags & RTF_GATEWAY) {
1432 const struct in6_addr *gw_addr;
1435 gw_addr = &cfg->fc_gateway;
1436 rt->rt6i_gateway = *gw_addr;
1437 gwa_type = ipv6_addr_type(gw_addr);
1439 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1440 struct rt6_info *grt;
1442 /* IPv6 strictly inhibits using not link-local
1443 addresses as nexthop address.
1444 Otherwise, router will not able to send redirects.
1445 It is very good, but in some (rare!) circumstances
1446 (SIT, PtP, NBMA NOARP links) it is handy to allow
1447 some exceptions. --ANK
1450 if (!(gwa_type & IPV6_ADDR_UNICAST))
1453 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1455 err = -EHOSTUNREACH;
1459 if (dev != grt->dst.dev) {
1460 dst_release(&grt->dst);
1465 idev = grt->rt6i_idev;
1467 in6_dev_hold(grt->rt6i_idev);
1469 if (!(grt->rt6i_flags & RTF_GATEWAY))
1471 dst_release(&grt->dst);
1477 if (!dev || (dev->flags & IFF_LOOPBACK))
1485 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1486 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1490 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1491 rt->rt6i_prefsrc.plen = 128;
1493 rt->rt6i_prefsrc.plen = 0;
1495 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1496 err = rt6_bind_neighbour(rt, dev);
1501 rt->rt6i_flags = cfg->fc_flags;
1508 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1509 int type = nla_type(nla);
1512 if (type > RTAX_MAX) {
1517 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1523 rt->rt6i_idev = idev;
1524 rt->rt6i_table = table;
1526 cfg->fc_nlinfo.nl_net = dev_net(dev);
1528 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/*
 * __ip6_del_rt - remove @rt from its table under the write lock and
 * drop the caller's reference; deleting the ip6_null_entry sentinel is
 * refused (the -ENOENT return line is missing from this extract).
 * ip6_del_rt wraps it with a default nl_info, mirroring ip6_ins_rt.
 */
1540 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1543 struct fib6_table *table;
1544 struct net *net = dev_net(rt->dst.dev);
1546 if (rt == net->ipv6.ip6_null_entry)
1549 table = rt->rt6i_table;
1550 write_lock_bh(&table->tb6_lock);
1552 err = fib6_del(rt, info);
1553 dst_release(&rt->dst);
1555 write_unlock_bh(&table->tb6_lock);
1560 int ip6_del_rt(struct rt6_info *rt)
1562 struct nl_info info = {
1563 .nl_net = dev_net(rt->dst.dev),
1565 return __ip6_del_rt(rt, &info);
/*
 * ip6_route_del - delete the route matching a fib6_config: locate the
 * (dst, src) node, then scan its leaf chain for an entry agreeing with
 * the requested ifindex, gateway and metric; hand the match (with a
 * hold taken — the dst_hold line is missing from this extract) to
 * __ip6_del_rt. Falls through to -ESRCH when nothing matches.
 */
1568 static int ip6_route_del(struct fib6_config *cfg)
1570 struct fib6_table *table;
1571 struct fib6_node *fn;
1572 struct rt6_info *rt;
1575 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1579 read_lock_bh(&table->tb6_lock);
1581 fn = fib6_locate(&table->tb6_root,
1582 &cfg->fc_dst, cfg->fc_dst_len,
1583 &cfg->fc_src, cfg->fc_src_len);
1586 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1587 if (cfg->fc_ifindex &&
1589 rt->dst.dev->ifindex != cfg->fc_ifindex))
1591 if (cfg->fc_flags & RTF_GATEWAY &&
1592 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1594 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1597 read_unlock_bh(&table->tb6_lock);
1599 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1602 read_unlock_bh(&table->tb6_lock);
/*
 * Flow key for redirect lookups: a flowi6 (first member, not visible in
 * this sampled view — TODO confirm) extended with the redirecting
 * router's address so __ip6_route_redirect() can validate the sender.
 */
1610 struct ip6rd_flowi {
1612 struct in6_addr gateway;
/*
 * Table-lookup callback used by ip6_route_redirect() (via
 * fib6_rule_lookup).  Finds the route currently used for the redirected
 * destination and verifies the redirect really came from its next hop:
 * the route must be unexpired, have a gateway, go out the interface the
 * redirect arrived on, and that gateway must equal the redirect sender.
 * Falls back to the netns null entry when nothing matches.
 */
1615 static struct rt6_info *__ip6_route_redirect(struct net *net,
1616 struct fib6_table *table,
1620 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1621 struct rt6_info *rt;
1622 struct fib6_node *fn;
1625 * Get the "current" route for this destination and
1626 * check if the redirect has come from appropriate router.
1628 * RFC 2461 specifies that redirects should only be
1629 * accepted if they come from the nexthop to the target.
1630 * Due to the way the routes are chosen, this notion
1631 * is a bit fuzzy and one might need to check all possible
1635 read_lock_bh(&table->tb6_lock);
1636 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1638 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1640 * Current route is on-link; redirect is always invalid.
1642 * Seems, previous statement is not true. It could
1643 * be node, which looks for us as on-link (f.e. proxy ndisc)
1644 * But then router serving it might decide, that we should
1645 * know truth 8)8) --ANK (980726).
1647 if (rt6_check_expired(rt))
1649 if (!(rt->rt6i_flags & RTF_GATEWAY))
1651 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1653 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1659 rt = net->ipv6.ip6_null_entry;
1660 BACKTRACK(net, &fl6->saddr);
1664 read_unlock_bh(&table->tb6_lock);
/*
 * Build the extended flow key for a received redirect and dispatch the
 * policy-aware lookup with __ip6_route_redirect as the table callback.
 * Link-local / multicast destinations force a strict interface match.
 */
1669 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1670 const struct in6_addr *src,
1671 const struct in6_addr *gateway,
1672 struct net_device *dev)
1674 int flags = RT6_LOOKUP_F_HAS_SADDR;
1675 struct net *net = dev_net(dev);
1676 struct ip6rd_flowi rdfl = {
1678 .flowi6_oif = dev->ifindex,
1684 rdfl.gateway = *gateway;
1686 if (rt6_need_strict(dest))
1687 flags |= RT6_LOOKUP_F_IFACE;
1689 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1690 flags, __ip6_route_redirect);
/*
 * Process an accepted ICMPv6 redirect: validate the affected route,
 * update the neighbour cache with the new link-layer address, then
 * install a cloned host route (RTF_CACHE) pointing at the new gateway
 * and notify netevent listeners.  Duplicate redirects (same neighbour)
 * are silently ignored.
 */
1693 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1694 const struct in6_addr *saddr,
1695 struct neighbour *neigh, u8 *lladdr, int on_link)
1697 struct rt6_info *rt, *nrt = NULL;
1698 struct netevent_redirect netevent;
1699 struct net *net = dev_net(neigh->dev);
1700 struct neighbour *old_neigh;
1702 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1704 if (rt == net->ipv6.ip6_null_entry) {
1705 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1710 * We have finally decided to accept it.
/* Only mark the sender as a router when this was not an on-link redirect. */
1713 neigh_update(neigh, lladdr, NUD_STALE,
1714 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1715 NEIGH_UPDATE_F_OVERRIDE|
1716 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1717 NEIGH_UPDATE_F_ISROUTER))
1721 * Redirect received -> path was valid.
1722 * Look, redirects are sent only in response to data packets,
1723 * so that this nexthop apparently is reachable. --ANK
1725 dst_confirm(&rt->dst);
1727 /* Duplicate redirect: silently ignore. */
1729 if (neigh == old_neigh)
1732 nrt = ip6_rt_copy(rt, dest);
1736 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1738 nrt->rt6i_flags &= ~RTF_GATEWAY;
1740 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1741 nrt->n = neigh_clone(neigh);
1743 if (ip6_ins_rt(nrt))
/* Tell interested subsystems (e.g. offload drivers) the path changed. */
1746 netevent.old = &rt->dst;
1747 netevent.old_neigh = old_neigh;
1748 netevent.new = &nrt->dst;
1749 netevent.new_neigh = neigh;
1750 netevent.daddr = dest;
1751 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1753 if (rt->rt6i_flags & RTF_CACHE) {
1759 dst_release(&rt->dst);
1763 * Misc support functions
/*
 * Clone @ort into a new host route (/128) for @dest.  Copies dst ops,
 * metrics, device/idev references, gateway and flags; expiry and metric
 * are reset so the clone starts fresh.  RA-learned default routes keep a
 * "from" back-reference to their parent.
 */
1766 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1767 const struct in6_addr *dest)
1769 struct net *net = dev_net(ort->dst.dev);
1770 struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
1774 rt->dst.input = ort->dst.input;
1775 rt->dst.output = ort->dst.output;
1776 rt->dst.flags |= DST_HOST;
1778 rt->rt6i_dst.addr = *dest;
1779 rt->rt6i_dst.plen = 128;
1780 dst_copy_metrics(&rt->dst, &ort->dst);
1781 rt->dst.error = ort->dst.error;
1782 rt->rt6i_idev = ort->rt6i_idev;
1784 in6_dev_hold(rt->rt6i_idev);
1785 rt->dst.lastuse = jiffies;
1787 rt->rt6i_gateway = ort->rt6i_gateway;
1788 rt->rt6i_flags = ort->rt6i_flags;
1789 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1790 (RTF_DEFAULT | RTF_ADDRCONF))
1791 rt6_set_from(rt, ort);
1793 rt6_clean_expires(rt);
1794 rt->rt6i_metric = 0;
1796 #ifdef CONFIG_IPV6_SUBTREES
1797 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1799 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1800 rt->rt6i_table = ort->rt6i_table;
1805 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Look up an RA Route Information option route (RTF_ROUTEINFO) in the
 * RT6_TABLE_INFO table matching prefix/len, gateway and interface.
 * Runs under the table write lock; return-value refcounting happens in
 * lines not visible in this sampled view.
 */
1806 static struct rt6_info *rt6_get_route_info(struct net *net,
1807 const struct in6_addr *prefix, int prefixlen,
1808 const struct in6_addr *gwaddr, int ifindex)
1810 struct fib6_node *fn;
1811 struct rt6_info *rt = NULL;
1812 struct fib6_table *table;
1814 table = fib6_get_table(net, RT6_TABLE_INFO);
1818 write_lock_bh(&table->tb6_lock);
1819 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1823 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1824 if (rt->dst.dev->ifindex != ifindex)
1826 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1828 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1834 write_unlock_bh(&table->tb6_lock);
/*
 * Install a route learned from an RA Route Information option, then
 * return it via rt6_get_route_info().  A zero prefix length means the
 * option advertises a default route.
 */
1838 static struct rt6_info *rt6_add_route_info(struct net *net,
1839 const struct in6_addr *prefix, int prefixlen,
1840 const struct in6_addr *gwaddr, int ifindex,
1843 struct fib6_config cfg = {
1844 .fc_table = RT6_TABLE_INFO,
1845 .fc_metric = IP6_RT_PRIO_USER,
1846 .fc_ifindex = ifindex,
1847 .fc_dst_len = prefixlen,
1848 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1849 RTF_UP | RTF_PREF(pref),
1851 .fc_nlinfo.nlh = NULL,
1852 .fc_nlinfo.nl_net = net,
1855 cfg.fc_dst = *prefix;
1856 cfg.fc_gateway = *gwaddr;
1858 /* We should treat it as a default route if prefix length is 0. */
1860 cfg.fc_flags |= RTF_DEFAULT;
/* ip6_route_add() may fail; the lookup below then simply returns NULL. */
1862 ip6_route_add(&cfg);
1864 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/*
 * Find the RA-learned default route via gateway @addr on @dev in the
 * RT6_TABLE_DFLT table (both RTF_ADDRCONF and RTF_DEFAULT must be set).
 */
1868 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1870 struct rt6_info *rt;
1871 struct fib6_table *table;
1873 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1877 write_lock_bh(&table->tb6_lock);
1878 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1879 if (dev == rt->dst.dev &&
1880 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1881 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1886 write_unlock_bh(&table->tb6_lock);
/*
 * Install an RA-learned default router entry (expiring, with the given
 * preference) and return it via rt6_get_dflt_router().
 */
1890 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1891 struct net_device *dev,
1894 struct fib6_config cfg = {
1895 .fc_table = RT6_TABLE_DFLT,
1896 .fc_metric = IP6_RT_PRIO_USER,
1897 .fc_ifindex = dev->ifindex,
1898 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1899 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1901 .fc_nlinfo.nlh = NULL,
1902 .fc_nlinfo.nl_net = dev_net(dev),
1905 cfg.fc_gateway = *gwaddr;
1907 ip6_route_add(&cfg);
1909 return rt6_get_dflt_router(gwaddr, dev);
/*
 * Remove all addrconf/default routes from the default-router table.
 * The read lock is dropped before each deletion (presumably followed by
 * ip6_del_rt() and a restart of the scan — those lines are not visible
 * in this sampled view).
 */
1912 void rt6_purge_dflt_routers(struct net *net)
1914 struct rt6_info *rt;
1915 struct fib6_table *table;
1917 /* NOTE: Keep consistent with rt6_get_dflt_router */
1918 table = fib6_get_table(net, RT6_TABLE_DFLT);
1923 read_lock_bh(&table->tb6_lock);
1924 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1925 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1927 read_unlock_bh(&table->tb6_lock);
1932 read_unlock_bh(&table->tb6_lock);
/*
 * Translate a legacy ioctl in6_rtmsg into a fib6_config, always
 * targeting the main table.
 */
1935 static void rtmsg_to_fib6_config(struct net *net,
1936 struct in6_rtmsg *rtmsg,
1937 struct fib6_config *cfg)
1939 memset(cfg, 0, sizeof(*cfg));
1941 cfg->fc_table = RT6_TABLE_MAIN;
1942 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1943 cfg->fc_metric = rtmsg->rtmsg_metric;
1944 cfg->fc_expires = rtmsg->rtmsg_info;
1945 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1946 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1947 cfg->fc_flags = rtmsg->rtmsg_flags;
1949 cfg->fc_nlinfo.nl_net = net;
1951 cfg->fc_dst = rtmsg->rtmsg_dst;
1952 cfg->fc_src = rtmsg->rtmsg_src;
1953 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/*
 * SIOCADDRT/SIOCDELRT ioctl entry point: requires CAP_NET_ADMIN, copies
 * the in6_rtmsg from userspace, converts it and adds/deletes the route.
 * NOTE(review): the error-return and RTNL-locking lines are missing from
 * this sampled view — confirm against the full source.
 */
1956 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1958 struct fib6_config cfg;
1959 struct in6_rtmsg rtmsg;
1963 case SIOCADDRT: /* Add a route */
1964 case SIOCDELRT: /* Delete a route */
1965 if (!capable(CAP_NET_ADMIN))
1967 err = copy_from_user(&rtmsg, arg,
1968 sizeof(struct in6_rtmsg));
1972 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1977 err = ip6_route_add(&cfg);
1980 err = ip6_route_del(&cfg);
1994 * Drop the packet on the floor
/*
 * Common handler for null/blackhole-style routes: bump the matching
 * SNMP counter and send an ICMPv6 Destination Unreachable with @code.
 * Input packets addressed to :: count as address errors instead.
 */
1997 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2000 struct dst_entry *dst = skb_dst(skb);
2001 switch (ipstats_mib_noroutes) {
2002 case IPSTATS_MIB_INNOROUTES:
2003 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2004 if (type == IPV6_ADDR_ANY) {
2005 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2006 IPSTATS_MIB_INADDRERRORS);
2010 case IPSTATS_MIB_OUTNOROUTES:
2011 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2012 ipstats_mib_noroutes);
2015 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst input handler for the null route: drop with "no route" (input). */
2020 static int ip6_pkt_discard(struct sk_buff *skb)
2022 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst output handler for the null route: set skb->dev, drop as output. */
2025 static int ip6_pkt_discard_out(struct sk_buff *skb)
2027 skb->dev = skb_dst(skb)->dev;
2028 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2031 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Input handler for prohibit routes: administratively prohibited. */
2033 static int ip6_pkt_prohibit(struct sk_buff *skb)
2035 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* Output handler for prohibit routes: administratively prohibited. */
2038 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2040 skb->dev = skb_dst(skb)->dev;
2041 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2047 * Allocate a dst for local (unicast / anycast) address.
/*
 * Allocate the host route (/128, local table) backing a local unicast or
 * anycast address.  The dst is allocated on the loopback device, marked
 * RTF_LOCAL or RTF_ANYCAST (the selecting condition is in lines not
 * visible here), and bound to a neighbour entry.  Returns an ERR_PTR on
 * allocation or neighbour-binding failure.
 */
2050 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2051 const struct in6_addr *addr,
2054 struct net *net = dev_net(idev->dev);
2055 struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL);
2059 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2060 return ERR_PTR(-ENOMEM);
2065 rt->dst.flags |= DST_HOST;
2066 rt->dst.input = ip6_input;
2067 rt->dst.output = ip6_output;
2068 rt->rt6i_idev = idev;
2069 rt->dst.obsolete = -1;
2071 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2073 rt->rt6i_flags |= RTF_ANYCAST;
2075 rt->rt6i_flags |= RTF_LOCAL;
2076 err = rt6_bind_neighbour(rt, rt->dst.dev);
2079 return ERR_PTR(err);
2082 rt->rt6i_dst.addr = *addr;
2083 rt->rt6i_dst.plen = 128;
2084 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2086 atomic_set(&rt->dst.__refcnt, 1);
/*
 * Choose a source address for @daddr: a preferred source configured on
 * the route wins outright; otherwise fall back to the standard
 * device-based source address selection.
 */
2091 int ip6_route_get_saddr(struct net *net,
2092 struct rt6_info *rt,
2093 const struct in6_addr *daddr,
2095 struct in6_addr *saddr)
2097 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2099 if (rt->rt6i_prefsrc.plen)
2100 *saddr = rt->rt6i_prefsrc.addr;
2102 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2103 daddr, prefs, saddr);
2107 /* remove deleted ip from prefsrc entries */
/* Walker argument for fib6_remove_prefsrc(): device, netns, address. */
2108 struct arg_dev_net_ip {
2109 struct net_device *dev;
2111 struct in6_addr *addr;
/*
 * fib6_clean_all() callback: clear the preferred-source setting on any
 * route (except the null entry) whose prefsrc matches a deleted address,
 * optionally restricted to one device (NULL dev matches all).
 */
2114 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2116 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2117 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2118 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2120 if (((void *)rt->dst.dev == dev || !dev) &&
2121 rt != net->ipv6.ip6_null_entry &&
2122 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2123 /* remove prefsrc entry */
2124 rt->rt6i_prefsrc.plen = 0;
/*
 * Called when address @ifp is removed: sweep the whole FIB clearing
 * prefsrc references to it via fib6_remove_prefsrc().
 */
2131 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2132 struct net *net = dev_net(ifp->idev->dev);
2133 struct arg_dev_net_ip adni = {
2134 .dev = ifp->idev->dev,
2137 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/* Walker argument for fib6_ifdown(): device going down plus its netns. */
2140 struct arg_dev_net {
2141 struct net_device *dev;
/*
 * fib6_clean_all() callback: select for removal every route on the
 * closing device (or every route when dev is NULL), never the null entry.
 */
2145 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2147 const struct arg_dev_net *adn = arg;
2148 const struct net_device *dev = adn->dev;
2150 if ((rt->dst.dev == dev || !dev) &&
2151 rt != adn->net->ipv6.ip6_null_entry)
/*
 * Device-down handler: purge the device's routes from both the FIB and
 * the ICMP rate-limit dst cache.
 */
2159 void rt6_ifdown(struct net *net, struct net_device *dev)
2159 struct arg_dev_net adn = {
2164 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2165 icmp6_clean_all(fib6_ifdown, &adn);
/* Walker argument for rt6_mtu_change_route(): device and its new MTU. */
2168 struct rt6_mtu_change_arg {
2169 struct net_device *dev;
/*
 * fib6_clean_all() callback applying a device MTU change to each route:
 * update RTAX_MTU (unless locked) when the route's PMTU would exceed the
 * new MTU, or when an MTU increase applies because the old device MTU
 * was the path bottleneck.
 */
2173 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2175 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2176 struct inet6_dev *idev;
2178 /* In IPv6 pmtu discovery is not optional,
2179 so that RTAX_MTU lock cannot disable it.
2180 We still use this lock to block changes
2181 caused by addrconf/ndisc.
2184 idev = __in6_dev_get(arg->dev);
2188 /* For administrative MTU increase, there is no way to discover
2189 IPv6 PMTU increase, so PMTU increase should be updated here.
2190 Since RFC 1981 doesn't include administrative MTU increase
2191 update PMTU increase is a MUST. (i.e. jumbo frame)
2194 If new MTU is less than route PMTU, this new MTU will be the
2195 lowest MTU in the path, update the route PMTU to reflect PMTU
2196 decreases; if new MTU is greater than route PMTU, and the
2197 old MTU is the lowest MTU in the path, update the route PMTU
2198 to reflect the increase. In this case if the other nodes' MTU
2199 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2202 if (rt->dst.dev == arg->dev &&
2203 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2204 (dst_mtu(&rt->dst) >= arg->mtu ||
2205 (dst_mtu(&rt->dst) < arg->mtu &&
2206 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2207 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2212 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2214 struct rt6_mtu_change_arg arg = {
2219 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/* Netlink attribute validation policy for IPv6 RTM_* route messages. */
2222 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2223 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2224 [RTA_OIF] = { .type = NLA_U32 },
2225 [RTA_IIF] = { .type = NLA_U32 },
2226 [RTA_PRIORITY] = { .type = NLA_U32 },
2227 [RTA_METRICS] = { .type = NLA_NESTED },
/*
 * Parse an rtnetlink route message into a fib6_config: validate the
 * attributes against rtm_ipv6_policy, then translate rtm header fields
 * and each optional RTA_* attribute.  Prefix addresses are copied only
 * up to the byte length implied by the prefix length.
 */
2230 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2231 struct fib6_config *cfg)
2234 struct nlattr *tb[RTA_MAX+1];
2237 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2242 rtm = nlmsg_data(nlh);
2243 memset(cfg, 0, sizeof(*cfg));
2245 cfg->fc_table = rtm->rtm_table;
2246 cfg->fc_dst_len = rtm->rtm_dst_len;
2247 cfg->fc_src_len = rtm->rtm_src_len;
2248 cfg->fc_flags = RTF_UP;
2249 cfg->fc_protocol = rtm->rtm_protocol;
2251 if (rtm->rtm_type == RTN_UNREACHABLE)
2252 cfg->fc_flags |= RTF_REJECT;
2254 if (rtm->rtm_type == RTN_LOCAL)
2255 cfg->fc_flags |= RTF_LOCAL;
2257 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2258 cfg->fc_nlinfo.nlh = nlh;
2259 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2261 if (tb[RTA_GATEWAY]) {
2262 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2263 cfg->fc_flags |= RTF_GATEWAY;
2267 int plen = (rtm->rtm_dst_len + 7) >> 3;
2269 if (nla_len(tb[RTA_DST]) < plen)
2272 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2276 int plen = (rtm->rtm_src_len + 7) >> 3;
2278 if (nla_len(tb[RTA_SRC]) < plen)
2281 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2284 if (tb[RTA_PREFSRC])
2285 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2288 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2290 if (tb[RTA_PRIORITY])
2291 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2293 if (tb[RTA_METRICS]) {
2294 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2295 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2299 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/* RTM_DELROUTE handler: parse the message, then delete the route. */
2306 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2308 struct fib6_config cfg;
2311 err = rtm_to_fib6_config(skb, nlh, &cfg);
2315 return ip6_route_del(&cfg);
/* RTM_NEWROUTE handler: parse the message, then add the route. */
2318 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2320 struct fib6_config cfg;
2323 err = rtm_to_fib6_config(skb, nlh, &cfg);
2327 return ip6_route_add(&cfg);
/*
 * Worst-case netlink message size for one route dump entry; must stay
 * in sync with the attributes emitted by rt6_fill_node().
 */
2330 static inline size_t rt6_nlmsg_size(void)
2332 return NLMSG_ALIGN(sizeof(struct rtmsg))
2333 + nla_total_size(16) /* RTA_SRC */
2334 + nla_total_size(16) /* RTA_DST */
2335 + nla_total_size(16) /* RTA_GATEWAY */
2336 + nla_total_size(16) /* RTA_PREFSRC */
2337 + nla_total_size(4) /* RTA_TABLE */
2338 + nla_total_size(4) /* RTA_IIF */
2339 + nla_total_size(4) /* RTA_OIF */
2340 + nla_total_size(4) /* RTA_PRIORITY */
2341 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2342 + nla_total_size(sizeof(struct rta_cacheinfo));
/*
 * Serialize one rt6_info into an RTM_* netlink message: rtmsg header
 * (type, protocol, table) followed by address, interface, metric and
 * cacheinfo attributes.  When @prefix is set only RTF_PREFIX_RT routes
 * are emitted.  @dst/@src, when non-NULL, override the route's own
 * prefixes with /128 host entries (RTM_GETROUTE replies).  Returns the
 * nlmsg_end() result, or cancels the message on overflow.
 */
2345 static int rt6_fill_node(struct net *net,
2346 struct sk_buff *skb, struct rt6_info *rt,
2347 struct in6_addr *dst, struct in6_addr *src,
2348 int iif, int type, u32 pid, u32 seq,
2349 int prefix, int nowait, unsigned int flags)
2351 const struct inet_peer *peer;
2353 struct nlmsghdr *nlh;
2356 struct neighbour *n;
2359 if (prefix) { /* user wants prefix routes only */
2360 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2361 /* success since this is not a prefix route */
2366 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2370 rtm = nlmsg_data(nlh);
2371 rtm->rtm_family = AF_INET6;
2372 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2373 rtm->rtm_src_len = rt->rt6i_src.plen;
2376 table = rt->rt6i_table->tb6_id;
2378 table = RT6_TABLE_UNSPEC;
2379 rtm->rtm_table = table;
2380 if (nla_put_u32(skb, RTA_TABLE, table))
2381 goto nla_put_failure;
/* Map route flags / device type onto the rtnetlink route type. */
2382 if (rt->rt6i_flags & RTF_REJECT)
2383 rtm->rtm_type = RTN_UNREACHABLE;
2384 else if (rt->rt6i_flags & RTF_LOCAL)
2385 rtm->rtm_type = RTN_LOCAL;
2386 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2387 rtm->rtm_type = RTN_LOCAL;
2389 rtm->rtm_type = RTN_UNICAST;
2391 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2392 rtm->rtm_protocol = rt->rt6i_protocol;
2393 if (rt->rt6i_flags & RTF_DYNAMIC)
2394 rtm->rtm_protocol = RTPROT_REDIRECT;
2395 else if (rt->rt6i_flags & RTF_ADDRCONF)
2396 rtm->rtm_protocol = RTPROT_KERNEL;
2397 else if (rt->rt6i_flags & RTF_DEFAULT)
2398 rtm->rtm_protocol = RTPROT_RA;
2400 if (rt->rt6i_flags & RTF_CACHE)
2401 rtm->rtm_flags |= RTM_F_CLONED;
2404 if (nla_put(skb, RTA_DST, 16, dst))
2405 goto nla_put_failure;
2406 rtm->rtm_dst_len = 128;
2407 } else if (rtm->rtm_dst_len)
2408 if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2409 goto nla_put_failure;
2410 #ifdef CONFIG_IPV6_SUBTREES
2412 if (nla_put(skb, RTA_SRC, 16, src))
2413 goto nla_put_failure;
2414 rtm->rtm_src_len = 128;
2415 } else if (rtm->rtm_src_len &&
2416 nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2417 goto nla_put_failure;
2420 #ifdef CONFIG_IPV6_MROUTE
/* Multicast destinations are resolved through the mroute cache. */
2421 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2422 int err = ip6mr_get_route(net, skb, rtm, nowait);
2427 goto nla_put_failure;
2429 if (err == -EMSGSIZE)
2430 goto nla_put_failure;
2435 if (nla_put_u32(skb, RTA_IIF, iif))
2436 goto nla_put_failure;
2438 struct in6_addr saddr_buf;
2439 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2440 nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2441 goto nla_put_failure;
2444 if (rt->rt6i_prefsrc.plen) {
2445 struct in6_addr saddr_buf;
2446 saddr_buf = rt->rt6i_prefsrc.addr;
2447 if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2448 goto nla_put_failure;
2451 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2452 goto nla_put_failure;
2457 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2459 goto nla_put_failure;
2465 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2466 goto nla_put_failure;
2467 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2468 goto nla_put_failure;
2469 if (!(rt->rt6i_flags & RTF_EXPIRES))
2471 else if (rt->dst.expires - jiffies < INT_MAX)
2472 expires = rt->dst.expires - jiffies;
2477 if (rt6_has_peer(rt))
2478 peer = rt6_peer_ptr(rt);
2480 if (peer && peer->tcp_ts_stamp) {
2482 tsage = get_seconds() - peer->tcp_ts_stamp;
2485 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2486 expires, rt->dst.error) < 0)
2487 goto nla_put_failure;
2489 return nlmsg_end(skb, nlh);
2492 nlmsg_cancel(skb, nlh);
/*
 * FIB dump callback: honour the RTM_F_PREFIX filter from the request
 * header (when present) and emit the route as a NLM_F_MULTI message.
 */
2496 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2498 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2501 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2502 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2503 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2507 return rt6_fill_node(arg->net,
2508 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2509 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2510 prefix, 0, NLM_F_MULTI);
/*
 * RTM_GETROUTE handler: parse src/dst/iif/oif from the request, perform
 * an input-side lookup when iif is given (validating the device exists)
 * or an output lookup otherwise, then serialize the result with
 * rt6_fill_node() and unicast it back to the requester.
 */
2513 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2515 struct net *net = sock_net(in_skb->sk);
2516 struct nlattr *tb[RTA_MAX+1];
2517 struct rt6_info *rt;
2518 struct sk_buff *skb;
2521 int err, iif = 0, oif = 0;
2523 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2528 memset(&fl6, 0, sizeof(fl6));
2531 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2534 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2538 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2541 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2545 iif = nla_get_u32(tb[RTA_IIF]);
2548 oif = nla_get_u32(tb[RTA_OIF]);
2551 struct net_device *dev;
2554 dev = __dev_get_by_index(net, iif);
2560 fl6.flowi6_iif = iif;
2562 if (!ipv6_addr_any(&fl6.saddr))
2563 flags |= RT6_LOOKUP_F_HAS_SADDR;
2565 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2568 fl6.flowi6_oif = oif;
2570 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2573 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2575 dst_release(&rt->dst);
2580 /* Reserve room for dummy headers, this skb can pass
2581 through good chunk of routing engine.
2583 skb_reset_mac_header(skb);
2584 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2586 skb_dst_set(skb, &rt->dst);
2588 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2589 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2590 nlh->nlmsg_seq, 0, 0, 0);
2596 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/*
 * Broadcast a route change (@event, e.g. RTM_NEWROUTE/RTM_DELROUTE) to
 * RTNLGRP_IPV6_ROUTE listeners.  -EMSGSIZE from rt6_fill_node() means
 * rt6_nlmsg_size() is out of sync — that is a bug, hence the WARN_ON.
 */
2601 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2603 struct sk_buff *skb;
2604 struct net *net = info->nl_net;
2609 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2611 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2615 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2616 event, info->pid, seq, 0, 0, 0);
2618 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2619 WARN_ON(err == -EMSGSIZE);
2623 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2624 info->nlh, gfp_any());
2628 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * Netdevice notifier: when a netns registers its loopback device, point
 * the per-netns special routes (null / prohibit / blackhole) at it and
 * take idev references.
 */
2631 static int ip6_route_dev_notify(struct notifier_block *this,
2632 unsigned long event, void *data)
2634 struct net_device *dev = (struct net_device *)data;
2635 struct net *net = dev_net(dev);
2637 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2638 net->ipv6.ip6_null_entry->dst.dev = dev;
2639 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2640 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2641 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2642 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2643 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2644 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2655 #ifdef CONFIG_PROC_FS
/*
 * /proc/net/ipv6_route line formatter for one route: dst prefix, src
 * prefix (or zeros without subtrees), next hop (or zeros), metric,
 * refcount, use count, flags and device name.
 */
2666 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2668 struct seq_file *m = p_arg;
2669 struct neighbour *n;
2671 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2673 #ifdef CONFIG_IPV6_SUBTREES
2674 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2676 seq_puts(m, "00000000000000000000000000000000 00 ");
2681 seq_printf(m, "%pi6", n->primary_key);
2683 seq_puts(m, "00000000000000000000000000000000");
2686 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2687 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2688 rt->dst.__use, rt->rt6i_flags,
2689 rt->dst.dev ? rt->dst.dev->name : "");
/* seq_file show: walk the whole FIB read-only, printing each route. */
2693 static int ipv6_route_show(struct seq_file *m, void *v)
2695 struct net *net = (struct net *)m->private;
2696 fib6_clean_all_ro(net, rt6_info_route, 0, m);
/* open() for /proc/net/ipv6_route (per-netns single_open wrapper). */
2700 static int ipv6_route_open(struct inode *inode, struct file *file)
2702 return single_open_net(inode, file, ipv6_route_show);
/* File operations for /proc/net/ipv6_route. */
2705 static const struct file_operations ipv6_route_proc_fops = {
2706 .owner = THIS_MODULE,
2707 .open = ipv6_route_open,
2709 .llseek = seq_lseek,
2710 .release = single_release_net,
/* /proc/net/rt6_stats: one line of hex FIB statistics counters. */
2713 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2715 struct net *net = (struct net *)seq->private;
2716 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2717 net->ipv6.rt6_stats->fib_nodes,
2718 net->ipv6.rt6_stats->fib_route_nodes,
2719 net->ipv6.rt6_stats->fib_rt_alloc,
2720 net->ipv6.rt6_stats->fib_rt_entries,
2721 net->ipv6.rt6_stats->fib_rt_cache,
2722 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2723 net->ipv6.rt6_stats->fib_discarded_routes);
/* open() for /proc/net/rt6_stats (per-netns single_open wrapper). */
2730 return single_open_net(inode, file, rt6_stats_seq_show);
/* File operations for /proc/net/rt6_stats. */
2733 static const struct file_operations rt6_stats_seq_fops = {
2734 .owner = THIS_MODULE,
2735 .open = rt6_stats_seq_open,
2737 .llseek = seq_lseek,
2738 .release = single_release_net,
2740 #endif /* CONFIG_PROC_FS */
2742 #ifdef CONFIG_SYSCTL
/*
 * Handler for net.ipv6.route.flush: reads the per-netns flush delay,
 * lets proc_dointvec update it, then triggers garbage collection —
 * immediately (~0UL) for non-positive delays, otherwise after @delay.
 */
2745 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2746 void __user *buffer, size_t *lenp, loff_t *ppos)
2753 net = (struct net *)ctl->extra1;
2754 delay = net->ipv6.sysctl.flush_delay;
2755 proc_dointvec(ctl, write, buffer, lenp, ppos);
2756 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
/*
 * Template for the per-netns net.ipv6.route sysctl table; the .data
 * pointers reference init_net and are rewritten for each namespace in
 * ipv6_route_sysctl_init() (entry order must match the indices there).
 */
2760 ctl_table ipv6_route_table_template[] = {
2762 .procname = "flush",
2763 .data = &init_net.ipv6.sysctl.flush_delay,
2764 .maxlen = sizeof(int),
2766 .proc_handler = ipv6_sysctl_rtcache_flush
2769 .procname = "gc_thresh",
2770 .data = &ip6_dst_ops_template.gc_thresh,
2771 .maxlen = sizeof(int),
2773 .proc_handler = proc_dointvec,
2776 .procname = "max_size",
2777 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2778 .maxlen = sizeof(int),
2780 .proc_handler = proc_dointvec,
2783 .procname = "gc_min_interval",
2784 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2785 .maxlen = sizeof(int),
2787 .proc_handler = proc_dointvec_jiffies,
2790 .procname = "gc_timeout",
2791 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2792 .maxlen = sizeof(int),
2794 .proc_handler = proc_dointvec_jiffies,
2797 .procname = "gc_interval",
2798 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2799 .maxlen = sizeof(int),
2801 .proc_handler = proc_dointvec_jiffies,
2804 .procname = "gc_elasticity",
2805 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2806 .maxlen = sizeof(int),
2808 .proc_handler = proc_dointvec,
2811 .procname = "mtu_expires",
2812 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2813 .maxlen = sizeof(int),
2815 .proc_handler = proc_dointvec_jiffies,
2818 .procname = "min_adv_mss",
2819 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2820 .maxlen = sizeof(int),
2822 .proc_handler = proc_dointvec,
2825 .procname = "gc_min_interval_ms",
2826 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2827 .maxlen = sizeof(int),
2829 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * Duplicate the sysctl template for a new netns and rebind each entry's
 * .data to that namespace's fields.  Indices must track the template's
 * entry order exactly.
 */
2834 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2836 struct ctl_table *table;
2838 table = kmemdup(ipv6_route_table_template,
2839 sizeof(ipv6_route_table_template),
2843 table[0].data = &net->ipv6.sysctl.flush_delay;
2844 table[0].extra1 = net;
2845 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2846 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2847 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2848 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2849 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2850 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2851 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2852 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2853 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/*
 * Per-netns init: clone the dst-ops template, allocate the special
 * routes (null entry always; prohibit and blackhole with multiple
 * tables), wire their dst.path/ops/metrics, and seed the routing
 * sysctl defaults.  Unwinds in reverse order on any failure.
 */
2860 static int __net_init ip6_route_net_init(struct net *net)
2864 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2865 sizeof(net->ipv6.ip6_dst_ops));
2867 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2868 goto out_ip6_dst_ops;
2870 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2871 sizeof(*net->ipv6.ip6_null_entry),
2873 if (!net->ipv6.ip6_null_entry)
2874 goto out_ip6_dst_entries;
2875 net->ipv6.ip6_null_entry->dst.path =
2876 (struct dst_entry *)net->ipv6.ip6_null_entry;
2877 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2878 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2879 ip6_template_metrics, true);
2881 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2882 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2883 sizeof(*net->ipv6.ip6_prohibit_entry),
2885 if (!net->ipv6.ip6_prohibit_entry)
2886 goto out_ip6_null_entry;
2887 net->ipv6.ip6_prohibit_entry->dst.path =
2888 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2889 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2890 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2891 ip6_template_metrics, true);
2893 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2894 sizeof(*net->ipv6.ip6_blk_hole_entry),
2896 if (!net->ipv6.ip6_blk_hole_entry)
2897 goto out_ip6_prohibit_entry;
2898 net->ipv6.ip6_blk_hole_entry->dst.path =
2899 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2900 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2901 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2902 ip6_template_metrics, true);
/* Routing sysctl defaults for a fresh namespace. */
2905 net->ipv6.sysctl.flush_delay = 0;
2906 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2907 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2908 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2909 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2910 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2911 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2912 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2914 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2920 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2921 out_ip6_prohibit_entry:
2922 kfree(net->ipv6.ip6_prohibit_entry);
2924 kfree(net->ipv6.ip6_null_entry);
2926 out_ip6_dst_entries:
2927 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-netns teardown: free the special routes and the dst counters. */
2932 static void __net_exit ip6_route_net_exit(struct net *net)
2934 kfree(net->ipv6.ip6_null_entry);
2935 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2936 kfree(net->ipv6.ip6_prohibit_entry);
2937 kfree(net->ipv6.ip6_blk_hole_entry);
2939 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-netns init: register the two /proc/net routing files. */
2942 static int __net_init ip6_route_net_init_late(struct net *net)
2944 #ifdef CONFIG_PROC_FS
2945 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2946 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
/* Late per-netns teardown: remove the /proc/net routing files. */
2953 static void __net_exit ip6_route_net_exit_late(struct net *net)
2954 proc_net_remove(net, "ipv6_route");
2955 proc_net_remove(net, "rt6_stats");
/* Main per-netns lifecycle hooks for the IPv6 routing subsystem. */
2959 static struct pernet_operations ip6_route_net_ops = {
2960 .init = ip6_route_net_init,
2961 .exit = ip6_route_net_exit,
/* Allocate and initialize the per-netns IPv6 inet_peer base. */
2964 static int __net_init ipv6_inetpeer_init(struct net *net)
2966 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2970 inet_peer_base_init(bp);
2971 net->ipv6.peers = bp;
/* Detach and invalidate the per-netns IPv6 inet_peer tree. */
2975 static void __net_exit ipv6_inetpeer_exit(struct net *net)
2977 struct inet_peer_base *bp = net->ipv6.peers;
2979 net->ipv6.peers = NULL;
2980 inetpeer_invalidate_tree(bp);
/* Per-netns lifecycle hooks for the IPv6 inet_peer subsystem. */
2984 static struct pernet_operations ipv6_inetpeer_ops = {
2985 .init = ipv6_inetpeer_init,
2986 .exit = ipv6_inetpeer_exit,
/* Late-stage per-netns hooks (procfs registration/removal). */
2989 static struct pernet_operations ip6_route_net_late_ops = {
2990 .init = ip6_route_net_init_late,
2991 .exit = ip6_route_net_exit_late,
/* Netdevice notifier hooking loopback registration (see handler above). */
2994 static struct notifier_block ip6_route_dev_notifier = {
2995 .notifier_call = ip6_route_dev_notify,
2999 int __init ip6_route_init(void)
3004 ip6_dst_ops_template.kmem_cachep =
3005 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3006 SLAB_HWCACHE_ALIGN, NULL);
3007 if (!ip6_dst_ops_template.kmem_cachep)
3010 ret = dst_entries_init(&ip6_dst_blackhole_ops);
3012 goto out_kmem_cache;
3014 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3016 goto out_dst_entries;
3018 ret = register_pernet_subsys(&ip6_route_net_ops);
3020 goto out_register_inetpeer;
3022 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3024 /* Registering of the loopback is done before this portion of code,
3025 * the loopback reference in rt6_info will not be taken, do it
3026 * manually for init_net */
3027 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3028 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3029 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3030 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3031 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3032 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3033 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3037 goto out_register_subsys;
3043 ret = fib6_rules_init();
3047 ret = register_pernet_subsys(&ip6_route_net_late_ops);
3049 goto fib6_rules_init;
3052 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3053 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3054 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3055 goto out_register_late_subsys;
3057 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3059 goto out_register_late_subsys;
3064 out_register_late_subsys:
3065 unregister_pernet_subsys(&ip6_route_net_late_ops);
3067 fib6_rules_cleanup();
3072 out_register_subsys:
3073 unregister_pernet_subsys(&ip6_route_net_ops);
3074 out_register_inetpeer:
3075 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3077 dst_entries_destroy(&ip6_dst_blackhole_ops);
3079 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3083 void ip6_route_cleanup(void)
3085 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3086 unregister_pernet_subsys(&ip6_route_net_late_ops);
3087 fib6_rules_cleanup();
3090 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3091 unregister_pernet_subsys(&ip6_route_net_ops);
3092 dst_entries_destroy(&ip6_dst_blackhole_ops);
3093 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);