2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
61 #include <asm/uaccess.h>
64 #include <linux/sysctl.h>
67 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
68 const struct in6_addr *dest);
69 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
70 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
71 static unsigned int ip6_mtu(const struct dst_entry *dst);
72 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
73 static void ip6_dst_destroy(struct dst_entry *);
74 static void ip6_dst_ifdown(struct dst_entry *,
75 struct net_device *dev, int how);
76 static int ip6_dst_gc(struct dst_ops *ops);
78 static int ip6_pkt_discard(struct sk_buff *skb);
79 static int ip6_pkt_discard_out(struct sk_buff *skb);
80 static void ip6_link_failure(struct sk_buff *skb);
81 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
82 static void rt6_do_redirect(struct dst_entry *dst, struct sk_buff *skb);
84 #ifdef CONFIG_IPV6_ROUTE_INFO
85 static struct rt6_info *rt6_add_route_info(struct net *net,
86 const struct in6_addr *prefix, int prefixlen,
87 const struct in6_addr *gwaddr, int ifindex,
89 static struct rt6_info *rt6_get_route_info(struct net *net,
90 const struct in6_addr *prefix, int prefixlen,
91 const struct in6_addr *gwaddr, int ifindex);
94 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
96 struct rt6_info *rt = (struct rt6_info *) dst;
97 struct inet_peer *peer;
100 if (!(rt->dst.flags & DST_HOST))
103 peer = rt6_get_peer_create(rt);
105 u32 *old_p = __DST_METRICS_PTR(old);
106 unsigned long prev, new;
109 if (inet_metrics_new(peer))
110 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
112 new = (unsigned long) p;
113 prev = cmpxchg(&dst->_metrics, old, new);
116 p = __DST_METRICS_PTR(prev);
117 if (prev & DST_METRICS_READ_ONLY)
124 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
128 struct in6_addr *p = &rt->rt6i_gateway;
130 if (!ipv6_addr_any(p))
131 return (const void *) p;
133 return &ipv6_hdr(skb)->daddr;
137 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
141 struct rt6_info *rt = (struct rt6_info *) dst;
144 daddr = choose_neigh_daddr(rt, skb, daddr);
145 n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
148 return neigh_create(&nd_tbl, daddr, dst->dev);
151 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
153 struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
155 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
164 static struct dst_ops ip6_dst_ops_template = {
166 .protocol = cpu_to_be16(ETH_P_IPV6),
169 .check = ip6_dst_check,
170 .default_advmss = ip6_default_advmss,
172 .cow_metrics = ipv6_cow_metrics,
173 .destroy = ip6_dst_destroy,
174 .ifdown = ip6_dst_ifdown,
175 .negative_advice = ip6_negative_advice,
176 .link_failure = ip6_link_failure,
177 .update_pmtu = ip6_rt_update_pmtu,
178 .redirect = rt6_do_redirect,
179 .local_out = __ip6_local_out,
180 .neigh_lookup = ip6_neigh_lookup,
/*
 * MTU callback used by ip6_dst_blackhole_ops.
 *
 * Returns the raw RTAX_MTU metric stored on the dst when it is non-zero;
 * otherwise falls back to the MTU of the dst's device.
 */
183 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
185 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
187 return mtu ? : dst->dev->mtu;
190 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
194 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
200 static struct dst_ops ip6_dst_blackhole_ops = {
202 .protocol = cpu_to_be16(ETH_P_IPV6),
203 .destroy = ip6_dst_destroy,
204 .check = ip6_dst_check,
205 .mtu = ip6_blackhole_mtu,
206 .default_advmss = ip6_default_advmss,
207 .update_pmtu = ip6_rt_blackhole_update_pmtu,
208 .cow_metrics = ip6_rt_blackhole_cow_metrics,
209 .neigh_lookup = ip6_neigh_lookup,
212 static const u32 ip6_template_metrics[RTAX_MAX] = {
213 [RTAX_HOPLIMIT - 1] = 255,
216 static struct rt6_info ip6_null_entry_template = {
218 .__refcnt = ATOMIC_INIT(1),
221 .error = -ENETUNREACH,
222 .input = ip6_pkt_discard,
223 .output = ip6_pkt_discard_out,
225 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
226 .rt6i_protocol = RTPROT_KERNEL,
227 .rt6i_metric = ~(u32) 0,
228 .rt6i_ref = ATOMIC_INIT(1),
231 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
233 static int ip6_pkt_prohibit(struct sk_buff *skb);
234 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
236 static struct rt6_info ip6_prohibit_entry_template = {
238 .__refcnt = ATOMIC_INIT(1),
242 .input = ip6_pkt_prohibit,
243 .output = ip6_pkt_prohibit_out,
245 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
246 .rt6i_protocol = RTPROT_KERNEL,
247 .rt6i_metric = ~(u32) 0,
248 .rt6i_ref = ATOMIC_INIT(1),
251 static struct rt6_info ip6_blk_hole_entry_template = {
253 .__refcnt = ATOMIC_INIT(1),
257 .input = dst_discard,
258 .output = dst_discard,
260 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
261 .rt6i_protocol = RTPROT_KERNEL,
262 .rt6i_metric = ~(u32) 0,
263 .rt6i_ref = ATOMIC_INIT(1),
268 /* allocate dst with ip6_dst_ops */
269 static inline struct rt6_info *ip6_dst_alloc(struct net *net,
270 struct net_device *dev,
272 struct fib6_table *table)
274 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
279 sizeof(*rt) - sizeof(struct dst_entry));
280 rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
285 static void ip6_dst_destroy(struct dst_entry *dst)
287 struct rt6_info *rt = (struct rt6_info *)dst;
288 struct inet6_dev *idev = rt->rt6i_idev;
291 neigh_release(rt->n);
293 if (!(rt->dst.flags & DST_HOST))
294 dst_destroy_metrics_generic(dst);
297 rt->rt6i_idev = NULL;
301 if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
302 dst_release(dst->from);
304 if (rt6_has_peer(rt)) {
305 struct inet_peer *peer = rt6_peer_ptr(rt);
310 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
/* Return the current value of the file-scope __rt6_peer_genid counter. */
312 static u32 rt6_peer_genid(void)
314 return atomic_read(&__rt6_peer_genid);
317 void rt6_bind_peer(struct rt6_info *rt, int create)
319 struct inet_peer_base *base;
320 struct inet_peer *peer;
322 base = inetpeer_base_ptr(rt->_rt6i_peer);
326 peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
328 if (!rt6_set_peer(rt, peer))
331 rt->rt6i_peer_genid = rt6_peer_genid();
335 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
338 struct rt6_info *rt = (struct rt6_info *)dst;
339 struct inet6_dev *idev = rt->rt6i_idev;
340 struct net_device *loopback_dev =
341 dev_net(dev)->loopback_dev;
343 if (dev != loopback_dev) {
344 if (idev && idev->dev == dev) {
345 struct inet6_dev *loopback_idev =
346 in6_dev_get(loopback_dev);
348 rt->rt6i_idev = loopback_idev;
352 if (rt->n && rt->n->dev == dev) {
353 rt->n->dev = loopback_dev;
354 dev_hold(loopback_dev);
360 static bool rt6_check_expired(const struct rt6_info *rt)
362 struct rt6_info *ort = NULL;
364 if (rt->rt6i_flags & RTF_EXPIRES) {
365 if (time_after(jiffies, rt->dst.expires))
367 } else if (rt->dst.from) {
368 ort = (struct rt6_info *) rt->dst.from;
369 return (ort->rt6i_flags & RTF_EXPIRES) &&
370 time_after(jiffies, ort->dst.expires);
/*
 * Report whether daddr's address type includes any of multicast,
 * link-local or loopback.
 *
 * Callers in this file use a true result to add RT6_LOOKUP_F_IFACE to
 * their lookup flags.
 */
375 static bool rt6_need_strict(const struct in6_addr *daddr)
377 return ipv6_addr_type(daddr) &
378 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
382 * Route lookup. Any table->tb6_lock is implied.
385 static inline struct rt6_info *rt6_device_match(struct net *net,
387 const struct in6_addr *saddr,
391 struct rt6_info *local = NULL;
392 struct rt6_info *sprt;
394 if (!oif && ipv6_addr_any(saddr))
397 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
398 struct net_device *dev = sprt->dst.dev;
401 if (dev->ifindex == oif)
403 if (dev->flags & IFF_LOOPBACK) {
404 if (!sprt->rt6i_idev ||
405 sprt->rt6i_idev->dev->ifindex != oif) {
406 if (flags & RT6_LOOKUP_F_IFACE && oif)
408 if (local && (!oif ||
409 local->rt6i_idev->dev->ifindex == oif))
415 if (ipv6_chk_addr(net, saddr, dev,
416 flags & RT6_LOOKUP_F_IFACE))
425 if (flags & RT6_LOOKUP_F_IFACE)
426 return net->ipv6.ip6_null_entry;
432 #ifdef CONFIG_IPV6_ROUTER_PREF
433 static void rt6_probe(struct rt6_info *rt)
435 struct neighbour *neigh;
437 * Okay, this does not seem to be appropriate
438 * for now, however, we need to check if it
439 * is really so; aka Router Reachability Probing.
441 * Router Reachability Probe MUST be rate-limited
442 * to no more than one per minute.
445 neigh = rt ? rt->n : NULL;
446 if (!neigh || (neigh->nud_state & NUD_VALID))
448 read_lock_bh(&neigh->lock);
449 if (!(neigh->nud_state & NUD_VALID) &&
450 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
451 struct in6_addr mcaddr;
452 struct in6_addr *target;
454 neigh->updated = jiffies;
455 read_unlock_bh(&neigh->lock);
457 target = (struct in6_addr *)&neigh->primary_key;
458 addrconf_addr_solict_mult(target, &mcaddr);
459 ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
461 read_unlock_bh(&neigh->lock);
467 static inline void rt6_probe(struct rt6_info *rt)
473 * Default Router Selection (RFC 2461 6.3.6)
475 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
477 struct net_device *dev = rt->dst.dev;
478 if (!oif || dev->ifindex == oif)
480 if ((dev->flags & IFF_LOOPBACK) &&
481 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
486 static inline int rt6_check_neigh(struct rt6_info *rt)
488 struct neighbour *neigh;
493 if (rt->rt6i_flags & RTF_NONEXTHOP ||
494 !(rt->rt6i_flags & RTF_GATEWAY))
497 read_lock_bh(&neigh->lock);
498 if (neigh->nud_state & NUD_VALID)
500 #ifdef CONFIG_IPV6_ROUTER_PREF
501 else if (neigh->nud_state & NUD_FAILED)
506 read_unlock_bh(&neigh->lock);
513 static int rt6_score_route(struct rt6_info *rt, int oif,
518 m = rt6_check_dev(rt, oif);
519 if (!m && (strict & RT6_LOOKUP_F_IFACE))
521 #ifdef CONFIG_IPV6_ROUTER_PREF
522 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
524 n = rt6_check_neigh(rt);
525 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
530 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
531 int *mpri, struct rt6_info *match)
535 if (rt6_check_expired(rt))
538 m = rt6_score_route(rt, oif, strict);
543 if (strict & RT6_LOOKUP_F_REACHABLE)
547 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
555 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
556 struct rt6_info *rr_head,
557 u32 metric, int oif, int strict)
559 struct rt6_info *rt, *match;
563 for (rt = rr_head; rt && rt->rt6i_metric == metric;
564 rt = rt->dst.rt6_next)
565 match = find_match(rt, oif, strict, &mpri, match);
566 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
567 rt = rt->dst.rt6_next)
568 match = find_match(rt, oif, strict, &mpri, match);
573 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
575 struct rt6_info *match, *rt0;
580 fn->rr_ptr = rt0 = fn->leaf;
582 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
585 (strict & RT6_LOOKUP_F_REACHABLE)) {
586 struct rt6_info *next = rt0->dst.rt6_next;
588 /* no entries matched; do round-robin */
589 if (!next || next->rt6i_metric != rt0->rt6i_metric)
596 net = dev_net(rt0->dst.dev);
597 return match ? match : net->ipv6.ip6_null_entry;
600 #ifdef CONFIG_IPV6_ROUTE_INFO
601 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
602 const struct in6_addr *gwaddr)
604 struct net *net = dev_net(dev);
605 struct route_info *rinfo = (struct route_info *) opt;
606 struct in6_addr prefix_buf, *prefix;
608 unsigned long lifetime;
611 if (len < sizeof(struct route_info)) {
615 /* Sanity check for prefix_len and length */
616 if (rinfo->length > 3) {
618 } else if (rinfo->prefix_len > 128) {
620 } else if (rinfo->prefix_len > 64) {
621 if (rinfo->length < 2) {
624 } else if (rinfo->prefix_len > 0) {
625 if (rinfo->length < 1) {
630 pref = rinfo->route_pref;
631 if (pref == ICMPV6_ROUTER_PREF_INVALID)
634 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
636 if (rinfo->length == 3)
637 prefix = (struct in6_addr *)rinfo->prefix;
639 /* this function is safe */
640 ipv6_addr_prefix(&prefix_buf,
641 (struct in6_addr *)rinfo->prefix,
643 prefix = &prefix_buf;
646 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
649 if (rt && !lifetime) {
655 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
658 rt->rt6i_flags = RTF_ROUTEINFO |
659 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
662 if (!addrconf_finite_timeout(lifetime))
663 rt6_clean_expires(rt);
665 rt6_set_expires(rt, jiffies + HZ * lifetime);
667 dst_release(&rt->dst);
673 #define BACKTRACK(__net, saddr) \
675 if (rt == __net->ipv6.ip6_null_entry) { \
676 struct fib6_node *pn; \
678 if (fn->fn_flags & RTN_TL_ROOT) \
681 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
682 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
685 if (fn->fn_flags & RTN_RTINFO) \
691 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
692 struct fib6_table *table,
693 struct flowi6 *fl6, int flags)
695 struct fib6_node *fn;
698 read_lock_bh(&table->tb6_lock);
699 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
702 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
703 BACKTRACK(net, &fl6->saddr);
705 dst_use(&rt->dst, jiffies);
706 read_unlock_bh(&table->tb6_lock);
/*
 * Exported route lookup entry point.
 *
 * Runs fib6_rule_lookup() for the flow fl6 in namespace net, using
 * ip6_pol_route_lookup as the per-table lookup callback, and returns the
 * dst_entry that call produces.
 * NOTE(review): the end of the parameter list is not visible in this
 * excerpt; `flags` is passed straight through to fib6_rule_lookup().
 */
711 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
714 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
716 EXPORT_SYMBOL_GPL(ip6_route_lookup);
718 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
719 const struct in6_addr *saddr, int oif, int strict)
721 struct flowi6 fl6 = {
725 struct dst_entry *dst;
726 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
729 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
730 flags |= RT6_LOOKUP_F_HAS_SADDR;
733 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
735 return (struct rt6_info *) dst;
742 EXPORT_SYMBOL(rt6_lookup);
744 /* ip6_ins_rt is called with FREE table->tb6_lock.
745 It takes a new route entry; if the addition fails for any reason, the
746 route is freed. In any case, if the caller does not hold it, it may
750 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
753 struct fib6_table *table;
755 table = rt->rt6i_table;
756 write_lock_bh(&table->tb6_lock);
757 err = fib6_add(&table->tb6_root, rt, info);
758 write_unlock_bh(&table->tb6_lock);
/*
 * Insert rt into its routing table.
 *
 * Builds an nl_info whose nl_net is the network namespace of rt's device
 * and hands both to __ip6_ins_rt(); the return value is that of
 * __ip6_ins_rt().
 */
763 int ip6_ins_rt(struct rt6_info *rt)
765 struct nl_info info = {
766 .nl_net = dev_net(rt->dst.dev),
768 return __ip6_ins_rt(rt, &info);
771 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
772 const struct in6_addr *daddr,
773 const struct in6_addr *saddr)
781 rt = ip6_rt_copy(ort, daddr);
784 int attempts = !in_softirq();
786 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
787 if (ort->rt6i_dst.plen != 128 &&
788 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
789 rt->rt6i_flags |= RTF_ANYCAST;
790 rt->rt6i_gateway = *daddr;
793 rt->rt6i_flags |= RTF_CACHE;
795 #ifdef CONFIG_IPV6_SUBTREES
796 if (rt->rt6i_src.plen && saddr) {
797 rt->rt6i_src.addr = *saddr;
798 rt->rt6i_src.plen = 128;
803 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
804 struct net *net = dev_net(rt->dst.dev);
805 int saved_rt_min_interval =
806 net->ipv6.sysctl.ip6_rt_gc_min_interval;
807 int saved_rt_elasticity =
808 net->ipv6.sysctl.ip6_rt_gc_elasticity;
810 if (attempts-- > 0) {
811 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
812 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
814 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
816 net->ipv6.sysctl.ip6_rt_gc_elasticity =
818 net->ipv6.sysctl.ip6_rt_gc_min_interval =
819 saved_rt_min_interval;
823 net_warn_ratelimited("Neighbour table overflow\n");
832 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
833 const struct in6_addr *daddr)
835 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
838 rt->rt6i_flags |= RTF_CACHE;
839 rt->n = neigh_clone(ort->n);
844 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
845 struct flowi6 *fl6, int flags)
847 struct fib6_node *fn;
848 struct rt6_info *rt, *nrt;
852 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
854 strict |= flags & RT6_LOOKUP_F_IFACE;
857 read_lock_bh(&table->tb6_lock);
860 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
863 rt = rt6_select(fn, oif, strict | reachable);
865 BACKTRACK(net, &fl6->saddr);
866 if (rt == net->ipv6.ip6_null_entry ||
867 rt->rt6i_flags & RTF_CACHE)
871 read_unlock_bh(&table->tb6_lock);
873 if (!rt->n && !(rt->rt6i_flags & RTF_NONEXTHOP))
874 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
875 else if (!(rt->dst.flags & DST_HOST))
876 nrt = rt6_alloc_clone(rt, &fl6->daddr);
880 dst_release(&rt->dst);
881 rt = nrt ? : net->ipv6.ip6_null_entry;
885 err = ip6_ins_rt(nrt);
894 * Race condition! In the gap, when table->tb6_lock was
895 * released someone could insert this route. Relookup.
897 dst_release(&rt->dst);
906 read_unlock_bh(&table->tb6_lock);
908 rt->dst.lastuse = jiffies;
/*
 * Per-table lookup callback handed to fib6_rule_lookup() by
 * ip6_route_input_lookup().
 *
 * Delegates to ip6_pol_route(), passing the flow's flowi6_iif as the
 * interface index argument.
 */
914 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
915 struct flowi6 *fl6, int flags)
917 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/*
 * Route lookup for a flow arriving on dev.
 *
 * Adds RT6_LOOKUP_F_IFACE to flags when rt6_need_strict() holds for the
 * flow's destination address and dev is not of type ARPHRD_PIMREG, then
 * returns the result of fib6_rule_lookup() with ip6_pol_route_input as the
 * per-table callback.
 */
920 static struct dst_entry *ip6_route_input_lookup(struct net *net,
921 struct net_device *dev,
922 struct flowi6 *fl6, int flags)
924 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
925 flags |= RT6_LOOKUP_F_IFACE;
927 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
930 void ip6_route_input(struct sk_buff *skb)
932 const struct ipv6hdr *iph = ipv6_hdr(skb);
933 struct net *net = dev_net(skb->dev);
934 int flags = RT6_LOOKUP_F_HAS_SADDR;
935 struct flowi6 fl6 = {
936 .flowi6_iif = skb->dev->ifindex,
939 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
940 .flowi6_mark = skb->mark,
941 .flowi6_proto = iph->nexthdr,
944 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/*
 * Per-table lookup callback handed to fib6_rule_lookup() by
 * ip6_route_output().
 *
 * Delegates to ip6_pol_route(), passing the flow's flowi6_oif as the
 * interface index argument.
 */
947 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
948 struct flowi6 *fl6, int flags)
950 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
953 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
958 fl6->flowi6_iif = net->loopback_dev->ifindex;
960 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
961 flags |= RT6_LOOKUP_F_IFACE;
963 if (!ipv6_addr_any(&fl6->saddr))
964 flags |= RT6_LOOKUP_F_HAS_SADDR;
966 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
968 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
971 EXPORT_SYMBOL(ip6_route_output);
973 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
975 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
976 struct dst_entry *new = NULL;
978 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
980 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
981 rt6_init_peer(rt, net->ipv6.peers);
986 new->input = dst_discard;
987 new->output = dst_discard;
989 if (dst_metrics_read_only(&ort->dst))
990 new->_metrics = ort->dst._metrics;
992 dst_copy_metrics(new, &ort->dst);
993 rt->rt6i_idev = ort->rt6i_idev;
995 in6_dev_hold(rt->rt6i_idev);
997 rt->rt6i_gateway = ort->rt6i_gateway;
998 rt->rt6i_flags = ort->rt6i_flags;
999 rt6_clean_expires(rt);
1000 rt->rt6i_metric = 0;
1002 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1003 #ifdef CONFIG_IPV6_SUBTREES
1004 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1010 dst_release(dst_orig);
1011 return new ? new : ERR_PTR(-ENOMEM);
1015 * Destination cache support functions
1018 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1020 struct rt6_info *rt;
1022 rt = (struct rt6_info *) dst;
1024 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
1025 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1026 if (!rt6_has_peer(rt))
1027 rt6_bind_peer(rt, 0);
1028 rt->rt6i_peer_genid = rt6_peer_genid();
1035 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1037 struct rt6_info *rt = (struct rt6_info *) dst;
1040 if (rt->rt6i_flags & RTF_CACHE) {
1041 if (rt6_check_expired(rt)) {
1053 static void ip6_link_failure(struct sk_buff *skb)
1055 struct rt6_info *rt;
1057 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1059 rt = (struct rt6_info *) skb_dst(skb);
1061 if (rt->rt6i_flags & RTF_CACHE)
1062 rt6_update_expires(rt, 0);
1063 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1064 rt->rt6i_node->fn_sernum = -1;
1068 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1070 struct rt6_info *rt6 = (struct rt6_info*)dst;
1073 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1074 struct net *net = dev_net(dst->dev);
1076 rt6->rt6i_flags |= RTF_MODIFIED;
1077 if (mtu < IPV6_MIN_MTU) {
1078 u32 features = dst_metric(dst, RTAX_FEATURES);
1080 features |= RTAX_FEATURE_ALLFRAG;
1081 dst_metric_set(dst, RTAX_FEATURES, features);
1083 dst_metric_set(dst, RTAX_MTU, mtu);
1084 rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
1088 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1091 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1092 struct dst_entry *dst;
1095 memset(&fl6, 0, sizeof(fl6));
1096 fl6.flowi6_oif = oif;
1097 fl6.flowi6_mark = mark;
1098 fl6.flowi6_flags = 0;
1099 fl6.daddr = iph->daddr;
1100 fl6.saddr = iph->saddr;
1101 fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
1103 dst = ip6_route_output(net, NULL, &fl6);
1105 ip6_rt_update_pmtu(dst, ntohl(mtu));
1108 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/*
 * Socket-based wrapper around ip6_update_pmtu().
 *
 * Takes the network namespace, bound device index and mark from sk and
 * forwards them, together with skb and mtu, to ip6_update_pmtu().
 */
1110 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1112 ip6_update_pmtu(skb, sock_net(sk), mtu,
1113 sk->sk_bound_dev_if, sk->sk_mark);
1115 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1117 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1119 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1120 struct dst_entry *dst;
1123 memset(&fl6, 0, sizeof(fl6));
1124 fl6.flowi6_oif = oif;
1125 fl6.flowi6_mark = mark;
1126 fl6.flowi6_flags = 0;
1127 fl6.daddr = iph->daddr;
1128 fl6.saddr = iph->saddr;
1129 fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
1131 dst = ip6_route_output(net, NULL, &fl6);
1133 rt6_do_redirect(dst, skb);
1136 EXPORT_SYMBOL_GPL(ip6_redirect);
/*
 * Socket-based wrapper around ip6_redirect().
 *
 * Takes the network namespace, bound device index and mark from sk and
 * forwards them, together with skb, to ip6_redirect().
 */
1138 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1140 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1142 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1144 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1146 struct net_device *dev = dst->dev;
1147 unsigned int mtu = dst_mtu(dst);
1148 struct net *net = dev_net(dev);
1150 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1152 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1153 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1156 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1157 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1158 * IPV6_MAXPLEN is also valid and means: "any MSS,
1159 * rely only on pmtu discovery"
1161 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1166 static unsigned int ip6_mtu(const struct dst_entry *dst)
1168 struct inet6_dev *idev;
1169 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1177 idev = __in6_dev_get(dst->dev);
1179 mtu = idev->cnf.mtu6;
1185 static struct dst_entry *icmp6_dst_gc_list;
1186 static DEFINE_SPINLOCK(icmp6_dst_lock);
1188 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1189 struct neighbour *neigh,
1192 struct dst_entry *dst;
1193 struct rt6_info *rt;
1194 struct inet6_dev *idev = in6_dev_get(dev);
1195 struct net *net = dev_net(dev);
1197 if (unlikely(!idev))
1198 return ERR_PTR(-ENODEV);
1200 rt = ip6_dst_alloc(net, dev, 0, NULL);
1201 if (unlikely(!rt)) {
1203 dst = ERR_PTR(-ENOMEM);
1210 neigh = ip6_neigh_lookup(&rt->dst, NULL, &fl6->daddr);
1211 if (IS_ERR(neigh)) {
1214 return ERR_CAST(neigh);
1218 rt->dst.flags |= DST_HOST;
1219 rt->dst.output = ip6_output;
1221 atomic_set(&rt->dst.__refcnt, 1);
1222 rt->rt6i_dst.addr = fl6->daddr;
1223 rt->rt6i_dst.plen = 128;
1224 rt->rt6i_idev = idev;
1225 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1227 spin_lock_bh(&icmp6_dst_lock);
1228 rt->dst.next = icmp6_dst_gc_list;
1229 icmp6_dst_gc_list = &rt->dst;
1230 spin_unlock_bh(&icmp6_dst_lock);
1232 fib6_force_start_gc(net);
1234 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1240 int icmp6_dst_gc(void)
1242 struct dst_entry *dst, **pprev;
1245 spin_lock_bh(&icmp6_dst_lock);
1246 pprev = &icmp6_dst_gc_list;
1248 while ((dst = *pprev) != NULL) {
1249 if (!atomic_read(&dst->__refcnt)) {
1258 spin_unlock_bh(&icmp6_dst_lock);
1263 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1266 struct dst_entry *dst, **pprev;
1268 spin_lock_bh(&icmp6_dst_lock);
1269 pprev = &icmp6_dst_gc_list;
1270 while ((dst = *pprev) != NULL) {
1271 struct rt6_info *rt = (struct rt6_info *) dst;
1272 if (func(rt, arg)) {
1279 spin_unlock_bh(&icmp6_dst_lock);
/*
 * Garbage-collection callback for the per-namespace IPv6 dst_ops.
 *
 * Reads the namespace's GC sysctls (minimum interval, maximum size,
 * elasticity, timeout) and the time of the last GC run, bumps
 * ip6_rt_gc_expire, runs fib6_run_gc() with it and records the run time.
 *
 * Returns non-zero when the entry count still exceeds ip6_rt_max_size.
 */
1282 static int ip6_dst_gc(struct dst_ops *ops)
1284 unsigned long now = jiffies;
/* ops is the ipv6.ip6_dst_ops member embedded in struct net. */
1285 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1286 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1287 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1288 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1289 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1290 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1293 entries = dst_entries_get_fast(ops);
/*
 * Still inside the minimum interval since the last run and not above the
 * size limit.
 * NOTE(review): the statement guarded by this condition is not visible in
 * this excerpt; presumably it skips the GC pass below - confirm.
 */
1294 if (time_after(rt_last_gc + rt_min_interval, now) &&
1295 entries <= rt_max_size)
1298 net->ipv6.ip6_rt_gc_expire++;
1299 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1300 net->ipv6.ip6_rt_last_gc = now;
/* Re-read the entry count after the GC run. */
1301 entries = dst_entries_get_slow(ops);
/* Below the GC threshold: reset the expiry to half the configured timeout. */
1302 if (entries < ops->gc_thresh)
1303 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
/* Decay the expiry by (expire >> elasticity). */
1305 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1306 return entries > rt_max_size;
1309 /* Clean host part of a prefix. Not necessary in radix tree,
1310 but results in cleaner routing tables.
1312 Remove it only when all the things will work!
1315 int ip6_dst_hoplimit(struct dst_entry *dst)
1317 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1318 if (hoplimit == 0) {
1319 struct net_device *dev = dst->dev;
1320 struct inet6_dev *idev;
1323 idev = __in6_dev_get(dev);
1325 hoplimit = idev->cnf.hop_limit;
1327 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1332 EXPORT_SYMBOL(ip6_dst_hoplimit);
1338 int ip6_route_add(struct fib6_config *cfg)
1341 struct net *net = cfg->fc_nlinfo.nl_net;
1342 struct rt6_info *rt = NULL;
1343 struct net_device *dev = NULL;
1344 struct inet6_dev *idev = NULL;
1345 struct fib6_table *table;
1348 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1350 #ifndef CONFIG_IPV6_SUBTREES
1351 if (cfg->fc_src_len)
1354 if (cfg->fc_ifindex) {
1356 dev = dev_get_by_index(net, cfg->fc_ifindex);
1359 idev = in6_dev_get(dev);
1364 if (cfg->fc_metric == 0)
1365 cfg->fc_metric = IP6_RT_PRIO_USER;
1368 if (cfg->fc_nlinfo.nlh &&
1369 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1370 table = fib6_get_table(net, cfg->fc_table);
1372 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1373 table = fib6_new_table(net, cfg->fc_table);
1376 table = fib6_new_table(net, cfg->fc_table);
1382 rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT, table);
1389 rt->dst.obsolete = -1;
1391 if (cfg->fc_flags & RTF_EXPIRES)
1392 rt6_set_expires(rt, jiffies +
1393 clock_t_to_jiffies(cfg->fc_expires));
1395 rt6_clean_expires(rt);
1397 if (cfg->fc_protocol == RTPROT_UNSPEC)
1398 cfg->fc_protocol = RTPROT_BOOT;
1399 rt->rt6i_protocol = cfg->fc_protocol;
1401 addr_type = ipv6_addr_type(&cfg->fc_dst);
1403 if (addr_type & IPV6_ADDR_MULTICAST)
1404 rt->dst.input = ip6_mc_input;
1405 else if (cfg->fc_flags & RTF_LOCAL)
1406 rt->dst.input = ip6_input;
1408 rt->dst.input = ip6_forward;
1410 rt->dst.output = ip6_output;
1412 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1413 rt->rt6i_dst.plen = cfg->fc_dst_len;
1414 if (rt->rt6i_dst.plen == 128)
1415 rt->dst.flags |= DST_HOST;
1417 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1418 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1423 dst_init_metrics(&rt->dst, metrics, 0);
1425 #ifdef CONFIG_IPV6_SUBTREES
1426 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1427 rt->rt6i_src.plen = cfg->fc_src_len;
1430 rt->rt6i_metric = cfg->fc_metric;
1432 /* We cannot add true routes via loopback here,
1433 they would result in kernel looping; promote them to reject routes
1435 if ((cfg->fc_flags & RTF_REJECT) ||
1436 (dev && (dev->flags & IFF_LOOPBACK) &&
1437 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1438 !(cfg->fc_flags & RTF_LOCAL))) {
1439 /* hold loopback dev/idev if we haven't done so. */
1440 if (dev != net->loopback_dev) {
1445 dev = net->loopback_dev;
1447 idev = in6_dev_get(dev);
1453 rt->dst.output = ip6_pkt_discard_out;
1454 rt->dst.input = ip6_pkt_discard;
1455 rt->dst.error = -ENETUNREACH;
1456 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1460 if (cfg->fc_flags & RTF_GATEWAY) {
1461 const struct in6_addr *gw_addr;
1464 gw_addr = &cfg->fc_gateway;
1465 rt->rt6i_gateway = *gw_addr;
1466 gwa_type = ipv6_addr_type(gw_addr);
1468 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1469 struct rt6_info *grt;
1471 /* IPv6 strictly prohibits using non-link-local
1472 addresses as the nexthop address.
1473 Otherwise, the router will not be able to send redirects.
1474 It is very good, but in some (rare!) circumstances
1475 (SIT, PtP, NBMA NOARP links) it is handy to allow
1476 some exceptions. --ANK
1479 if (!(gwa_type & IPV6_ADDR_UNICAST))
1482 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1484 err = -EHOSTUNREACH;
1488 if (dev != grt->dst.dev) {
1489 dst_release(&grt->dst);
1494 idev = grt->rt6i_idev;
1496 in6_dev_hold(grt->rt6i_idev);
1498 if (!(grt->rt6i_flags & RTF_GATEWAY))
1500 dst_release(&grt->dst);
1506 if (!dev || (dev->flags & IFF_LOOPBACK))
1514 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1515 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1519 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1520 rt->rt6i_prefsrc.plen = 128;
1522 rt->rt6i_prefsrc.plen = 0;
1524 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1525 err = rt6_bind_neighbour(rt, dev);
1530 rt->rt6i_flags = cfg->fc_flags;
1537 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1538 int type = nla_type(nla);
1541 if (type > RTAX_MAX) {
1546 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1552 rt->rt6i_idev = idev;
1553 rt->rt6i_table = table;
1555 cfg->fc_nlinfo.nl_net = dev_net(dev);
1557 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/*
 * Delete @rt from the FIB table it belongs to (rt->rt6i_table).
 *
 * The per-namespace ip6_null_entry is tested for before the table is
 * touched.  For any other route the table's tb6_lock is taken for writing
 * around fib6_del(), and dst_release() is called on rt->dst while the lock
 * is still held.  @info is passed through to fib6_del() unchanged.
 */
1569 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1572 struct fib6_table *table;
1573 struct net *net = dev_net(rt->dst.dev);
1575 if (rt == net->ipv6.ip6_null_entry)
1578 table = rt->rt6i_table;
1579 write_lock_bh(&table->tb6_lock);
1581 err = fib6_del(rt, info);
1582 dst_release(&rt->dst);
1584 write_unlock_bh(&table->tb6_lock);
/*
 * Wrapper around __ip6_del_rt() for callers that have no netlink context:
 * builds an nl_info whose nl_net is the namespace of the route's device
 * and forwards the call.
 */
1589 int ip6_del_rt(struct rt6_info *rt)
1591 struct nl_info info = {
1592 .nl_net = dev_net(rt->dst.dev),
1594 return __ip6_del_rt(rt, &info);
/*
 * Delete the route described by @cfg.
 *
 * The table is resolved from cfg->fc_nlinfo.nl_net and cfg->fc_table.
 * With the table read-locked, fib6_locate() finds the node for the
 * destination/source prefix pair and the node's leaf list is walked.
 * Each candidate is compared against the optional selectors in @cfg:
 *   - fc_ifindex (only when non-zero) against the route's device ifindex,
 *   - fc_gateway (only when RTF_GATEWAY is set in fc_flags),
 *   - fc_metric  (only when non-zero).
 * For the selected route the read lock is dropped before the result of
 * __ip6_del_rt() is returned; the lock is also dropped on the other
 * visible exit path.
 */
1597 static int ip6_route_del(struct fib6_config *cfg)
1599 struct fib6_table *table;
1600 struct fib6_node *fn;
1601 struct rt6_info *rt;
1604 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1608 read_lock_bh(&table->tb6_lock);
1610 fn = fib6_locate(&table->tb6_root,
1611 &cfg->fc_dst, cfg->fc_dst_len,
1612 &cfg->fc_src, cfg->fc_src_len);
1615 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1616 if (cfg->fc_ifindex &&
1618 rt->dst.dev->ifindex != cfg->fc_ifindex))
1620 if (cfg->fc_flags & RTF_GATEWAY &&
1621 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1623 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1626 read_unlock_bh(&table->tb6_lock);
1628 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1631 read_unlock_bh(&table->tb6_lock);
/*
 * Flow key used for redirect route lookups: a flow plus the address of the
 * router that sent the redirect.  ip6_route_redirect() passes the address
 * of the fl6 member to fib6_rule_lookup() and __ip6_route_redirect() casts
 * the struct flowi6 pointer it receives back to this type, so the flow has
 * to be the first member.
 */
1639 struct ip6rd_flowi {
1641 struct in6_addr gateway;
/*
 * Table lookup callback handed to fib6_rule_lookup() by
 * ip6_route_redirect().  @fl6 is really the fl6 member of a
 * struct ip6rd_flowi and is cast back to reach the gateway address.
 *
 * With @table read-locked, the node for fl6->daddr / fl6->saddr is looked
 * up and every route on its leaf list is tested for:
 *   - expiry (rt6_check_expired()),
 *   - RTF_GATEWAY being set,
 *   - the route's device ifindex matching fl6->flowi6_oif,
 *   - the route's gateway matching the redirect's source router.
 * The per-namespace ip6_null_entry and the BACKTRACK() macro are used on
 * the path that follows the loop.
 */
1644 static struct rt6_info *__ip6_route_redirect(struct net *net,
1645 struct fib6_table *table,
1649 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1650 struct rt6_info *rt;
1651 struct fib6_node *fn;
1654 * Get the "current" route for this destination and
1655 * check if the redirect has come from approriate router.
1657 * RFC 2461 specifies that redirects should only be
1658 * accepted if they come from the nexthop to the target.
1659 * Due to the way the routes are chosen, this notion
1660 * is a bit fuzzy and one might need to check all possible
1664 read_lock_bh(&table->tb6_lock);
1665 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1667 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1669 * Current route is on-link; redirect is always invalid.
1671 * Seems, previous statement is not true. It could
1672 * be node, which looks for us as on-link (f.e. proxy ndisc)
1673 * But then router serving it might decide, that we should
1674 * know truth 8)8) --ANK (980726).
1676 if (rt6_check_expired(rt))
1678 if (!(rt->rt6i_flags & RTF_GATEWAY))
1680 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1682 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1688 rt = net->ipv6.ip6_null_entry;
1689 BACKTRACK(net, &fl6->saddr);
1693 read_unlock_bh(&table->tb6_lock);
/*
 * Find the route a received redirect applies to.
 *
 * @dest:    destination address (also used for the rt6_need_strict() test)
 * @src:     source address for the lookup
 * @gateway: address of the router that sent the redirect
 * @dev:     receiving device; its ifindex becomes flowi6_oif
 *
 * Builds a struct ip6rd_flowi, starts from RT6_LOOKUP_F_HAS_SADDR and adds
 * RT6_LOOKUP_F_IFACE when rt6_need_strict(dest) is true, then runs the
 * policy lookup with __ip6_route_redirect() as the per-table callback.
 */
1698 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1699 const struct in6_addr *src,
1700 const struct in6_addr *gateway,
1701 struct net_device *dev)
1703 int flags = RT6_LOOKUP_F_HAS_SADDR;
1704 struct net *net = dev_net(dev);
1705 struct ip6rd_flowi rdfl = {
1707 .flowi6_oif = dev->ifindex,
1713 rdfl.gateway = *gateway;
1715 if (rt6_need_strict(dest))
1716 flags |= RT6_LOOKUP_F_IFACE;
1718 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1719 flags, __ip6_route_redirect);
/*
 * Apply the ICMPv6 Redirect carried in @skb to @dst, the route currently
 * used for the redirected destination.
 *
 * Steps visible in the body, in order:
 *   - the option length is the distance from the transport header to the
 *     skb tail minus the ICMPv6 header and two IPv6 addresses; a too short
 *     packet is logged;
 *   - the target address is read from directly behind the ICMPv6 header;
 *   - a multicast destination is logged and the target is required to be
 *     either equal to the destination or a link-local unicast address;
 *   - the receiving device's cnf.forwarding and cnf.accept_redirects
 *     settings are tested;
 *   - ND options behind the destination address are parsed and, when a
 *     target link-layer address option is present, its data is extracted;
 *   - a @dst equal to the namespace's ip6_null_entry is logged as not being
 *     a valid nexthop;
 *   - dst_confirm() is called on the old route, a neighbour entry for the
 *     target is looked up with creation enabled and compared with the old
 *     neighbour, then updated to NUD_STALE; the ISROUTER update flags are
 *     passed only when on_link is zero;
 *   - a host route is cloned with ip6_rt_copy(), given the flags
 *     RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE (RTF_GATEWAY is cleared
 *     again on one path, presumably the on-link case - confirm), pointed at
 *     the neighbour and inserted with ip6_ins_rt();
 *   - NETEVENT_REDIRECT notifiers are called with old/new dst and
 *     neighbour;
 *   - an old route carrying RTF_CACHE gets an extra reference via
 *     dst_clone();
 *   - the neighbour reference is released with neigh_release().
 */
1722 static void rt6_do_redirect(struct dst_entry *dst, struct sk_buff *skb)
1724 struct net *net = dev_net(skb->dev);
1725 struct netevent_redirect netevent;
1726 struct rt6_info *rt, *nrt = NULL;
1727 const struct in6_addr *target;
1728 struct ndisc_options ndopts;
1729 const struct in6_addr *dest;
1730 struct neighbour *old_neigh;
1731 struct inet6_dev *in6_dev;
1732 struct neighbour *neigh;
1733 struct icmp6hdr *icmph;
1734 int optlen, on_link;
1737 optlen = skb->tail - skb->transport_header;
1738 optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr);
1741 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
1745 icmph = icmp6_hdr(skb);
1746 target = (const struct in6_addr *) (icmph + 1);
1749 if (ipv6_addr_is_multicast(dest)) {
1750 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
1755 if (ipv6_addr_equal(dest, target)) {
1757 } else if (ipv6_addr_type(target) !=
1758 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
1759 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
1763 in6_dev = __in6_dev_get(skb->dev);
1766 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
1770 * The IP source address of the Redirect MUST be the same as the current
1771 * first-hop router for the specified ICMP Destination Address.
1774 if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) {
1775 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
1780 if (ndopts.nd_opts_tgt_lladdr) {
1781 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
1784 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
1789 rt = (struct rt6_info *) dst;
1790 if (rt == net->ipv6.ip6_null_entry) {
1791 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1795 /* Redirect received -> path was valid.
1796 * Look, redirects are sent only in response to data packets,
1797 * so that this nexthop apparently is reachable. --ANK
1799 dst_confirm(&rt->dst);
1801 neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1);
1805 /* Duplicate redirect: silently ignore. */
1807 if (neigh == old_neigh)
1811 * We have finally decided to accept it.
1814 neigh_update(neigh, lladdr, NUD_STALE,
1815 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1816 NEIGH_UPDATE_F_OVERRIDE|
1817 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1818 NEIGH_UPDATE_F_ISROUTER))
1821 nrt = ip6_rt_copy(rt, dest);
1825 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1827 nrt->rt6i_flags &= ~RTF_GATEWAY;
1829 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1830 nrt->n = neigh_clone(neigh);
1832 if (ip6_ins_rt(nrt))
1835 netevent.old = &rt->dst;
1836 netevent.old_neigh = old_neigh;
1837 netevent.new = &nrt->dst;
1838 netevent.new_neigh = neigh;
1839 netevent.daddr = dest;
1840 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1842 if (rt->rt6i_flags & RTF_CACHE) {
1843 rt = (struct rt6_info *) dst_clone(&rt->dst);
1848 neigh_release(neigh);
/*
 * Entry point for a received ICMPv6 Redirect in @skb.
 *
 * The target address is taken from directly behind the ICMPv6 header.  The
 * lookup source is the IPv6 header's destination address (this host) and
 * the redirecting router is the IPv6 header's source address.  The route
 * found by ip6_route_redirect() on the receiving device is handed to
 * rt6_do_redirect() and its reference is dropped afterwards.
 */
1851 void rt6_redirect(struct sk_buff *skb)
1853 const struct in6_addr *target;
1854 const struct in6_addr *dest;
1855 const struct in6_addr *src;
1856 const struct in6_addr *saddr;
1857 struct icmp6hdr *icmph;
1858 struct rt6_info *rt;
1860 icmph = icmp6_hdr(skb);
1861 target = (const struct in6_addr *) (icmph + 1);
1864 src = &ipv6_hdr(skb)->daddr;
1865 saddr = &ipv6_hdr(skb)->saddr;
1867 rt = ip6_route_redirect(dest, src, saddr, skb->dev);
1868 rt6_do_redirect(&rt->dst, skb);
1869 dst_release(&rt->dst);
1873 * Misc support functions
/*
 * Create a host route (/128) for @dest modelled on @ort.
 *
 * A new rt6_info is allocated on the same device as @ort.  Copied from the
 * original: input/output handlers, metrics, dst.error, the inet6_dev (with
 * a reference taken via in6_dev_hold()), gateway, flags, preferred source
 * and table; with CONFIG_IPV6_SUBTREES also the source key.  Set locally:
 * DST_HOST, destination *dest with prefix length 128, lastuse = jiffies
 * and rt6i_metric = 0.
 *
 * Originals that carry both RTF_DEFAULT and RTF_ADDRCONF are linked to the
 * copy through rt6_set_from(); rt6_clean_expires() is the other visible
 * expiry-related call.
 */
1876 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1877 const struct in6_addr *dest)
1879 struct net *net = dev_net(ort->dst.dev);
1880 struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
1884 rt->dst.input = ort->dst.input;
1885 rt->dst.output = ort->dst.output;
1886 rt->dst.flags |= DST_HOST;
1888 rt->rt6i_dst.addr = *dest;
1889 rt->rt6i_dst.plen = 128;
1890 dst_copy_metrics(&rt->dst, &ort->dst);
1891 rt->dst.error = ort->dst.error;
1892 rt->rt6i_idev = ort->rt6i_idev;
1894 in6_dev_hold(rt->rt6i_idev);
1895 rt->dst.lastuse = jiffies;
1897 rt->rt6i_gateway = ort->rt6i_gateway;
1898 rt->rt6i_flags = ort->rt6i_flags;
1899 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1900 (RTF_DEFAULT | RTF_ADDRCONF))
1901 rt6_set_from(rt, ort);
1903 rt6_clean_expires(rt);
1904 rt->rt6i_metric = 0;
1906 #ifdef CONFIG_IPV6_SUBTREES
1907 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1909 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1910 rt->rt6i_table = ort->rt6i_table;
1915 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Look up a route-information route in RT6_TABLE_INFO.
 *
 * With the table lock held for writing, the node for @prefix/@prefixlen
 * (no source key) is located and its leaf list is scanned for a route that
 * is on the device with index @ifindex, has both RTF_ROUTEINFO and
 * RTF_GATEWAY set, and whose gateway equals @gwaddr.
 */
1916 static struct rt6_info *rt6_get_route_info(struct net *net,
1917 const struct in6_addr *prefix, int prefixlen,
1918 const struct in6_addr *gwaddr, int ifindex)
1920 struct fib6_node *fn;
1921 struct rt6_info *rt = NULL;
1922 struct fib6_table *table;
1924 table = fib6_get_table(net, RT6_TABLE_INFO);
1928 write_lock_bh(&table->tb6_lock);
1929 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1933 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1934 if (rt->dst.dev->ifindex != ifindex)
1936 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1938 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1944 write_unlock_bh(&table->tb6_lock);
/*
 * Install a route-information route and return the installed entry.
 *
 * A fib6_config is built for RT6_TABLE_INFO with metric IP6_RT_PRIO_USER
 * and flags RTF_GATEWAY|RTF_ADDRCONF|RTF_ROUTEINFO|RTF_UP plus the encoded
 * router preference; RTF_DEFAULT is added on the zero-prefix-length path.
 * The return value of ip6_route_add() is not inspected here; the result is
 * whatever rt6_get_route_info() finds afterwards.
 */
1948 static struct rt6_info *rt6_add_route_info(struct net *net,
1949 const struct in6_addr *prefix, int prefixlen,
1950 const struct in6_addr *gwaddr, int ifindex,
1953 struct fib6_config cfg = {
1954 .fc_table = RT6_TABLE_INFO,
1955 .fc_metric = IP6_RT_PRIO_USER,
1956 .fc_ifindex = ifindex,
1957 .fc_dst_len = prefixlen,
1958 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1959 RTF_UP | RTF_PREF(pref),
1961 .fc_nlinfo.nlh = NULL,
1962 .fc_nlinfo.nl_net = net,
1965 cfg.fc_dst = *prefix;
1966 cfg.fc_gateway = *gwaddr;
1968 /* We should treat it as a default route if prefix length is 0. */
1970 cfg.fc_flags |= RTF_DEFAULT;
1972 ip6_route_add(&cfg);
1974 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/*
 * Find the autoconfigured default route through router @addr on @dev.
 *
 * Scans the root leaf list of RT6_TABLE_DFLT (table lock held for writing)
 * for a route on @dev that has BOTH RTF_ADDRCONF and RTF_DEFAULT set and
 * whose gateway equals @addr.
 */
1978 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1980 struct rt6_info *rt;
1981 struct fib6_table *table;
1983 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1987 write_lock_bh(&table->tb6_lock);
1988 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1989 if (dev == rt->dst.dev &&
1990 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1991 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1996 write_unlock_bh(&table->tb6_lock);
/*
 * Install a default route via router @gwaddr on @dev and return it.
 *
 * The route goes into RT6_TABLE_DFLT with metric IP6_RT_PRIO_USER and
 * flags RTF_GATEWAY|RTF_ADDRCONF|RTF_DEFAULT|RTF_UP|RTF_EXPIRES plus the
 * encoded router preference.  The return value of ip6_route_add() is not
 * inspected here; the result is whatever rt6_get_dflt_router() finds
 * afterwards.
 */
2000 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2001 struct net_device *dev,
2004 struct fib6_config cfg = {
2005 .fc_table = RT6_TABLE_DFLT,
2006 .fc_metric = IP6_RT_PRIO_USER,
2007 .fc_ifindex = dev->ifindex,
2008 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2009 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2011 .fc_nlinfo.nlh = NULL,
2012 .fc_nlinfo.nl_net = dev_net(dev),
2015 cfg.fc_gateway = *gwaddr;
2017 ip6_route_add(&cfg);
2019 return rt6_get_dflt_router(gwaddr, dev);
/*
 * Walk the root leaf list of RT6_TABLE_DFLT in @net under the table read
 * lock and act on default-router entries.  The read lock is dropped inside
 * the matching branch as well as after the loop.
 *
 * NOTE(review): the flag test below is true when EITHER RTF_DEFAULT or
 * RTF_ADDRCONF is set, whereas rt6_get_dflt_router() requires BOTH.
 * Confirm that the difference is intended.
 */
2022 void rt6_purge_dflt_routers(struct net *net)
2024 struct rt6_info *rt;
2025 struct fib6_table *table;
2027 /* NOTE: Keep consistent with rt6_get_dflt_router */
2028 table = fib6_get_table(net, RT6_TABLE_DFLT);
2033 read_lock_bh(&table->tb6_lock);
2034 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2035 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
2037 read_unlock_bh(&table->tb6_lock);
2042 read_unlock_bh(&table->tb6_lock);
/*
 * Translate an ioctl-style struct in6_rtmsg into a struct fib6_config.
 *
 * @cfg is zeroed first.  The table is always RT6_TABLE_MAIN; ifindex,
 * metric, expiry (from rtmsg_info), prefix lengths, flags and the
 * destination, source and gateway addresses are copied field by field,
 * and @net is recorded in fc_nlinfo.nl_net.
 */
2045 static void rtmsg_to_fib6_config(struct net *net,
2046 struct in6_rtmsg *rtmsg,
2047 struct fib6_config *cfg)
2049 memset(cfg, 0, sizeof(*cfg));
2051 cfg->fc_table = RT6_TABLE_MAIN;
2052 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2053 cfg->fc_metric = rtmsg->rtmsg_metric;
2054 cfg->fc_expires = rtmsg->rtmsg_info;
2055 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2056 cfg->fc_src_len = rtmsg->rtmsg_src_len;
2057 cfg->fc_flags = rtmsg->rtmsg_flags;
2059 cfg->fc_nlinfo.nl_net = net;
2061 cfg->fc_dst = rtmsg->rtmsg_dst;
2062 cfg->fc_src = rtmsg->rtmsg_src;
2063 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/*
 * Route ioctl handler for SIOCADDRT and SIOCDELRT.
 *
 * Both commands are gated on CAP_NET_ADMIN.  The struct in6_rtmsg is
 * copied in from user space at @arg, converted with
 * rtmsg_to_fib6_config() and passed to ip6_route_add() or
 * ip6_route_del().
 */
2066 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2068 struct fib6_config cfg;
2069 struct in6_rtmsg rtmsg;
2073 case SIOCADDRT: /* Add a route */
2074 case SIOCDELRT: /* Delete a route */
2075 if (!capable(CAP_NET_ADMIN))
2077 err = copy_from_user(&rtmsg, arg,
2078 sizeof(struct in6_rtmsg));
2082 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2087 err = ip6_route_add(&cfg);
2090 err = ip6_route_del(&cfg);
2104 * Drop the packet on the floor
/*
 * Common helper behind the discard/prohibit dst handlers.
 *
 * @code:                 ICMPv6 destination-unreachable code to report
 * @ipstats_mib_noroutes: IPSTATS_MIB_INNOROUTES or IPSTATS_MIB_OUTNOROUTES
 *
 * On the input side the packet's destination address type is examined
 * first, and an unspecified destination is counted as
 * IPSTATS_MIB_INADDRERRORS.  The counter named by @ipstats_mib_noroutes is
 * incremented under the OUTNOROUTES label.  An ICMPv6 destination
 * unreachable message with @code is sent for the packet.
 */
2107 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2110 struct dst_entry *dst = skb_dst(skb);
2111 switch (ipstats_mib_noroutes) {
2112 case IPSTATS_MIB_INNOROUTES:
2113 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2114 if (type == IPV6_ADDR_ANY) {
2115 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2116 IPSTATS_MIB_INADDRERRORS);
2120 case IPSTATS_MIB_OUTNOROUTES:
2121 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2122 ipstats_mib_noroutes);
2125 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* Input-side discard handler: ip6_pkt_drop() with ICMPV6_NOROUTE / INNOROUTES. */
2130 static int ip6_pkt_discard(struct sk_buff *skb)
2132 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/*
 * Output-side discard handler.  skb->dev is set to the dst's device before
 * ip6_pkt_drop() is called with ICMPV6_NOROUTE / OUTNOROUTES.
 */
2135 static int ip6_pkt_discard_out(struct sk_buff *skb)
2137 skb->dev = skb_dst(skb)->dev;
2138 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2141 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Input-side prohibit handler: ip6_pkt_drop() with ICMPV6_ADM_PROHIBITED / INNOROUTES. */
2143 static int ip6_pkt_prohibit(struct sk_buff *skb)
2145 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/*
 * Output-side prohibit handler.  skb->dev is set to the dst's device
 * before ip6_pkt_drop() is called with ICMPV6_ADM_PROHIBITED / OUTNOROUTES.
 */
2148 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2150 skb->dev = skb_dst(skb)->dev;
2151 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2157 * Allocate a dst for local (unicast / anycast) address.
/*
 * Build the host route for a local address @addr owned by @idev.
 *
 * The dst is allocated on the namespace's loopback device.  An allocation
 * failure is reported with a rate-limited warning and ERR_PTR(-ENOMEM).
 * The route is a /128 host route (DST_HOST) using ip6_input/ip6_output,
 * with dst.obsolete = -1 and flags RTF_UP|RTF_NONEXTHOP plus either
 * RTF_ANYCAST or RTF_LOCAL.  A failing rt6_bind_neighbour() is returned as
 * ERR_PTR(err).  The route is assigned to RT6_TABLE_LOCAL and its
 * reference count is set to 1.
 */
2160 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2161 const struct in6_addr *addr,
2164 struct net *net = dev_net(idev->dev);
2165 struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL);
2169 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2170 return ERR_PTR(-ENOMEM);
2175 rt->dst.flags |= DST_HOST;
2176 rt->dst.input = ip6_input;
2177 rt->dst.output = ip6_output;
2178 rt->rt6i_idev = idev;
2179 rt->dst.obsolete = -1;
2181 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2183 rt->rt6i_flags |= RTF_ANYCAST;
2185 rt->rt6i_flags |= RTF_LOCAL;
2186 err = rt6_bind_neighbour(rt, rt->dst.dev);
2189 return ERR_PTR(err);
2192 rt->rt6i_dst.addr = *addr;
2193 rt->rt6i_dst.plen = 128;
2194 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2196 atomic_set(&rt->dst.__refcnt, 1);
/*
 * Choose a source address for traffic to @daddr over @rt and store it in
 * @saddr.
 *
 * A route with a preferred source (rt6i_prefsrc.plen non-zero) supplies
 * that address directly.  The other visible path calls
 * ipv6_dev_get_saddr() with the device of the route's inet6_dev, or NULL
 * when the route has none, and keeps its result in err.
 */
2201 int ip6_route_get_saddr(struct net *net,
2202 struct rt6_info *rt,
2203 const struct in6_addr *daddr,
2205 struct in6_addr *saddr)
2207 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2209 if (rt->rt6i_prefsrc.plen)
2210 *saddr = rt->rt6i_prefsrc.addr;
2212 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2213 daddr, prefs, saddr);
2217 /* remove deleted ip from prefsrc entries */
/*
 * Argument bundle for the fib6_remove_prefsrc() walker: the device the
 * address lived on and the address being removed.  The walker also reads
 * a net member from this structure.
 */
2218 struct arg_dev_net_ip {
2219 struct net_device *dev;
2221 struct in6_addr *addr;
/*
 * fib6_clean_all() callback: forget a deleted address as preferred source.
 *
 * @arg points to a struct arg_dev_net_ip.  A route matches when it is on
 * the given device (any device if dev is NULL), is not the namespace's
 * ip6_null_entry, and its preferred source equals the address.  Matching
 * routes get rt6i_prefsrc.plen reset to 0.
 */
2224 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2226 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2227 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2228 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2230 if (((void *)rt->dst.dev == dev || !dev) &&
2231 rt != net->ipv6.ip6_null_entry &&
2232 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2233 /* remove prefsrc entry */
2234 rt->rt6i_prefsrc.plen = 0;
/*
 * Run fib6_remove_prefsrc() over every route in the namespace of @ifp's
 * device, with that device recorded in the walker argument.
 */
2239 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2241 struct net *net = dev_net(ifp->idev->dev);
2242 struct arg_dev_net_ip adni = {
2243 .dev = ifp->idev->dev,
2247 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/*
 * Argument bundle for the fib6_ifdown() walker.  Besides the device, the
 * walker reads a net member from this structure.
 */
2250 struct arg_dev_net {
2251 struct net_device *dev;
/*
 * Route walker used by rt6_ifdown().  @arg points to a struct arg_dev_net.
 * A route matches when it is on adn->dev (any device if dev is NULL) and
 * is not the namespace's ip6_null_entry.
 */
2255 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2257 const struct arg_dev_net *adn = arg;
2258 const struct net_device *dev = adn->dev;
2260 if ((rt->dst.dev == dev || !dev) &&
2261 rt != adn->net->ipv6.ip6_null_entry)
/*
 * Apply fib6_ifdown() for @dev twice: across all FIB tables of @net via
 * fib6_clean_all(), and across the entries visited by icmp6_clean_all().
 */
2267 void rt6_ifdown(struct net *net, struct net_device *dev)
2269 struct arg_dev_net adn = {
2274 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2275 icmp6_clean_all(fib6_ifdown, &adn);
/*
 * Argument bundle for rt6_mtu_change_route(): the device whose MTU
 * changed.  The walker also reads an mtu member holding the new value.
 */
2278 struct rt6_mtu_change_arg {
2279 struct net_device *dev;
/*
 * fib6_clean_all() callback used by rt6_mtu_change(); @p_arg points to a
 * struct rt6_mtu_change_arg.
 *
 * The RTAX_MTU metric of @rt is set to arg->mtu when the route is on
 * arg->dev, the metric is not locked, and either
 *   - the route's current MTU is greater than or equal to the new MTU, or
 *   - the current MTU is smaller than the new MTU and equals the device's
 *     cnf.mtu6.
 */
2283 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2285 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2286 struct inet6_dev *idev;
2288 /* In IPv6 pmtu discovery is not optional,
2289 so that RTAX_MTU lock cannot disable it.
2290 We still use this lock to block changes
2291 caused by addrconf/ndisc.
2294 idev = __in6_dev_get(arg->dev);
2298 /* For administrative MTU increase, there is no way to discover
2299 IPv6 PMTU increase, so PMTU increase should be updated here.
2300 Since RFC 1981 doesn't include administrative MTU increase
2301 update PMTU increase is a MUST. (i.e. jumbo frame)
2304 If new MTU is less than route PMTU, this new MTU will be the
2305 lowest MTU in the path, update the route PMTU to reflect PMTU
2306 decreases; if new MTU is greater than route PMTU, and the
2307 old MTU is the lowest MTU in the path, update the route PMTU
2308 to reflect the increase. In this case if the other nodes' MTU
2309 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2312 if (rt->dst.dev == arg->dev &&
2313 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2314 (dst_mtu(&rt->dst) >= arg->mtu ||
2315 (dst_mtu(&rt->dst) < arg->mtu &&
2316 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2317 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/*
 * Propagate a device MTU change: run rt6_mtu_change_route() over every
 * route in @dev's namespace.
 */
2322 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2324 struct rt6_mtu_change_arg arg = {
2329 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/*
 * Netlink attribute policy passed to nlmsg_parse() by the IPv6 route
 * message handlers in this file.  RTA_GATEWAY carries a length of one
 * IPv6 address; OIF, IIF and PRIORITY are u32; METRICS is nested.
 */
2332 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2333 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2334 [RTA_OIF] = { .type = NLA_U32 },
2335 [RTA_IIF] = { .type = NLA_U32 },
2336 [RTA_PRIORITY] = { .type = NLA_U32 },
2337 [RTA_METRICS] = { .type = NLA_NESTED },
/*
 * Translate a netlink route message into a struct fib6_config.
 *
 * The attributes are parsed against rtm_ipv6_policy, @cfg is zeroed and
 * then filled:
 *   - table, destination/source prefix lengths and protocol from the
 *     rtmsg header; fc_flags starts as RTF_UP;
 *   - RTN_UNREACHABLE adds RTF_REJECT, RTN_LOCAL adds RTF_LOCAL;
 *   - requester pid, message header and namespace go into fc_nlinfo;
 *   - RTA_GATEWAY copies 16 bytes and sets RTF_GATEWAY;
 *   - RTA_DST / RTA_SRC copy (prefix_len + 7) / 8 bytes, after checking
 *     that the attribute is at least that long;
 *   - RTA_PREFSRC copies 16 bytes;
 *   - RTA_OIF, RTA_PRIORITY and RTA_TABLE are read as u32 (RTA_TABLE
 *     replaces the table id taken from the header);
 *   - RTA_METRICS is not copied: fc_mx points into the message itself.
 */
2340 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2341 struct fib6_config *cfg)
2344 struct nlattr *tb[RTA_MAX+1];
2347 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2352 rtm = nlmsg_data(nlh);
2353 memset(cfg, 0, sizeof(*cfg));
2355 cfg->fc_table = rtm->rtm_table;
2356 cfg->fc_dst_len = rtm->rtm_dst_len;
2357 cfg->fc_src_len = rtm->rtm_src_len;
2358 cfg->fc_flags = RTF_UP;
2359 cfg->fc_protocol = rtm->rtm_protocol;
2361 if (rtm->rtm_type == RTN_UNREACHABLE)
2362 cfg->fc_flags |= RTF_REJECT;
2364 if (rtm->rtm_type == RTN_LOCAL)
2365 cfg->fc_flags |= RTF_LOCAL;
2367 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2368 cfg->fc_nlinfo.nlh = nlh;
2369 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2371 if (tb[RTA_GATEWAY]) {
2372 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2373 cfg->fc_flags |= RTF_GATEWAY;
2377 int plen = (rtm->rtm_dst_len + 7) >> 3;
2379 if (nla_len(tb[RTA_DST]) < plen)
2382 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2386 int plen = (rtm->rtm_src_len + 7) >> 3;
2388 if (nla_len(tb[RTA_SRC]) < plen)
2391 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2394 if (tb[RTA_PREFSRC])
2395 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2398 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2400 if (tb[RTA_PRIORITY])
2401 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2403 if (tb[RTA_METRICS]) {
2404 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2405 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2409 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/*
 * Netlink route-delete handler: converts the message with
 * rtm_to_fib6_config() and passes the result to ip6_route_del().
 */
2416 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2418 struct fib6_config cfg;
2421 err = rtm_to_fib6_config(skb, nlh, &cfg);
2425 return ip6_route_del(&cfg);
/*
 * Netlink route-add handler: converts the message with
 * rtm_to_fib6_config() and passes the result to ip6_route_add().
 */
2428 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2430 struct fib6_config cfg;
2433 err = rtm_to_fib6_config(skb, nlh, &cfg);
2437 return ip6_route_add(&cfg);
/*
 * Upper bound on the size of a netlink message produced by
 * rt6_fill_node(): the aligned rtmsg header plus every attribute that may
 * be emitted.  inet6_rt_notify() sizes its skb with this value and treats
 * -EMSGSIZE from rt6_fill_node() as a bug here, so the list must be kept
 * in step with rt6_fill_node().
 */
2440 static inline size_t rt6_nlmsg_size(void)
2442 return NLMSG_ALIGN(sizeof(struct rtmsg))
2443 + nla_total_size(16) /* RTA_SRC */
2444 + nla_total_size(16) /* RTA_DST */
2445 + nla_total_size(16) /* RTA_GATEWAY */
2446 + nla_total_size(16) /* RTA_PREFSRC */
2447 + nla_total_size(4) /* RTA_TABLE */
2448 + nla_total_size(4) /* RTA_IIF */
2449 + nla_total_size(4) /* RTA_OIF */
2450 + nla_total_size(4) /* RTA_PRIORITY */
2451 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2452 + nla_total_size(sizeof(struct rta_cacheinfo));
/*
 * Serialise route @rt into a netlink message of type @type appended to
 * @skb.
 *
 * @dst, @src: when given, reported as /128 RTA_DST / RTA_SRC in place of
 *             the route's own keys (RTA_SRC only with
 *             CONFIG_IPV6_SUBTREES)
 * @iif:       input interface index, reported as RTA_IIF
 * @prefix:    non-zero restricts output to RTF_PREFIX_RT routes; other
 *             routes are treated as success
 * @nowait:    passed to ip6mr_get_route() for multicast destinations
 * @flags:     netlink message flags passed to nlmsg_put()
 *
 * Header fields: rtm_type is RTN_UNREACHABLE for RTF_REJECT, RTN_LOCAL for
 * RTF_LOCAL or a loopback device, RTN_UNICAST on the remaining path.
 * rtm_protocol starts from rt6i_protocol and is replaced by
 * RTPROT_REDIRECT, RTPROT_KERNEL or RTPROT_RA for RTF_DYNAMIC,
 * RTF_ADDRCONF or RTF_DEFAULT respectively (first match wins).  RTF_CACHE
 * adds RTM_F_CLONED.
 *
 * Attributes emitted along the way: RTA_TABLE, RTA_DST, RTA_SRC, RTA_IIF,
 * RTA_PREFSRC (either from ip6_route_get_saddr() or from the route's
 * preferred source), the metrics, RTA_GATEWAY from the neighbour key,
 * RTA_OIF, RTA_PRIORITY and the cache info carrying the remaining
 * lifetime and dst.error.
 *
 * Returns the result of nlmsg_end() on success.  On the failure path the
 * partially built message is withdrawn with nlmsg_cancel().
 */
2455 static int rt6_fill_node(struct net *net,
2456 struct sk_buff *skb, struct rt6_info *rt,
2457 struct in6_addr *dst, struct in6_addr *src,
2458 int iif, int type, u32 pid, u32 seq,
2459 int prefix, int nowait, unsigned int flags)
2462 struct nlmsghdr *nlh;
2465 struct neighbour *n;
2467 if (prefix) { /* user wants prefix routes only */
2468 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2469 /* success since this is not a prefix route */
2474 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2478 rtm = nlmsg_data(nlh);
2479 rtm->rtm_family = AF_INET6;
2480 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2481 rtm->rtm_src_len = rt->rt6i_src.plen;
2484 table = rt->rt6i_table->tb6_id;
2486 table = RT6_TABLE_UNSPEC;
2487 rtm->rtm_table = table;
2488 if (nla_put_u32(skb, RTA_TABLE, table))
2489 goto nla_put_failure;
2490 if (rt->rt6i_flags & RTF_REJECT)
2491 rtm->rtm_type = RTN_UNREACHABLE;
2492 else if (rt->rt6i_flags & RTF_LOCAL)
2493 rtm->rtm_type = RTN_LOCAL;
2494 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2495 rtm->rtm_type = RTN_LOCAL;
2497 rtm->rtm_type = RTN_UNICAST;
2499 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2500 rtm->rtm_protocol = rt->rt6i_protocol;
2501 if (rt->rt6i_flags & RTF_DYNAMIC)
2502 rtm->rtm_protocol = RTPROT_REDIRECT;
2503 else if (rt->rt6i_flags & RTF_ADDRCONF)
2504 rtm->rtm_protocol = RTPROT_KERNEL;
2505 else if (rt->rt6i_flags & RTF_DEFAULT)
2506 rtm->rtm_protocol = RTPROT_RA;
2508 if (rt->rt6i_flags & RTF_CACHE)
2509 rtm->rtm_flags |= RTM_F_CLONED;
2512 if (nla_put(skb, RTA_DST, 16, dst))
2513 goto nla_put_failure;
2514 rtm->rtm_dst_len = 128;
2515 } else if (rtm->rtm_dst_len)
2516 if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2517 goto nla_put_failure;
2518 #ifdef CONFIG_IPV6_SUBTREES
2520 if (nla_put(skb, RTA_SRC, 16, src))
2521 goto nla_put_failure;
2522 rtm->rtm_src_len = 128;
2523 } else if (rtm->rtm_src_len &&
2524 nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2525 goto nla_put_failure;
2528 #ifdef CONFIG_IPV6_MROUTE
2529 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2530 int err = ip6mr_get_route(net, skb, rtm, nowait);
2535 goto nla_put_failure;
2537 if (err == -EMSGSIZE)
2538 goto nla_put_failure;
2543 if (nla_put_u32(skb, RTA_IIF, iif))
2544 goto nla_put_failure;
2546 struct in6_addr saddr_buf;
2547 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2548 nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2549 goto nla_put_failure;
2552 if (rt->rt6i_prefsrc.plen) {
2553 struct in6_addr saddr_buf;
2554 saddr_buf = rt->rt6i_prefsrc.addr;
2555 if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2556 goto nla_put_failure;
2559 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2560 goto nla_put_failure;
2565 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2567 goto nla_put_failure;
2573 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2574 goto nla_put_failure;
2575 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2576 goto nla_put_failure;
2577 if (!(rt->rt6i_flags & RTF_EXPIRES))
2579 else if (rt->dst.expires - jiffies < INT_MAX)
2580 expires = rt->dst.expires - jiffies;
2584 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2585 goto nla_put_failure;
2587 return nlmsg_end(skb, nlh);
2590 nlmsg_cancel(skb, nlh);
/*
 * Per-route callback for netlink route dumps; @p_arg points to a
 * struct rt6_rtnl_dump_arg.
 *
 * If the dump request is long enough to contain an rtmsg header, its
 * RTM_F_PREFIX flag selects prefix-only output.  The route is then emitted
 * as an RTM_NEWROUTE message with NLM_F_MULTI, addressed to the
 * requester's pid and sequence number.
 */
2594 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2596 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2599 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2600 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2601 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2605 return rt6_fill_node(arg->net,
2606 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2607 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2608 prefix, 0, NLM_F_MULTI);
/*
 * Netlink route-query handler: resolve the route for the flow described
 * in the request and unicast the answer back to the requester.
 *
 * The request is parsed against rtm_ipv6_policy.  RTA_SRC and RTA_DST must
 * each hold at least a full IPv6 address and are copied into a zeroed
 * flowi6; RTA_IIF and RTA_OIF are read as u32.  Two lookup paths are
 * visible: an input-route lookup using the device found by
 * __dev_get_by_index() for iif (with RT6_LOOKUP_F_HAS_SADDR when a source
 * address was given), and an output-route lookup with flowi6_oif = oif.
 *
 * The reply skb is allocated with NLMSG_GOODSIZE, given headroom for
 * MAX_HEADER plus an IPv6 header, and takes over the route through
 * skb_dst_set().  It is filled by rt6_fill_node() as RTM_NEWROUTE and
 * sent with rtnl_unicast().
 */
2611 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2613 struct net *net = sock_net(in_skb->sk);
2614 struct nlattr *tb[RTA_MAX+1];
2615 struct rt6_info *rt;
2616 struct sk_buff *skb;
2619 int err, iif = 0, oif = 0;
2621 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2626 memset(&fl6, 0, sizeof(fl6));
2629 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2632 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2636 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2639 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2643 iif = nla_get_u32(tb[RTA_IIF]);
2646 oif = nla_get_u32(tb[RTA_OIF]);
2649 struct net_device *dev;
2652 dev = __dev_get_by_index(net, iif);
2658 fl6.flowi6_iif = iif;
2660 if (!ipv6_addr_any(&fl6.saddr))
2661 flags |= RT6_LOOKUP_F_HAS_SADDR;
2663 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2666 fl6.flowi6_oif = oif;
2668 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2671 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2673 dst_release(&rt->dst);
2678 /* Reserve room for dummy headers, this skb can pass
2679 through good chunk of routing engine.
2681 skb_reset_mac_header(skb);
2682 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2684 skb_dst_set(skb, &rt->dst);
2686 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2687 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2688 nlh->nlmsg_seq, 0, 0, 0);
2694 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/*
 * Announce a routing change for @rt to RTNLGRP_IPV6_ROUTE listeners.
 *
 * @event is used as the netlink message type.  The sequence number comes
 * from the originating request in @info when there is one, otherwise it
 * is 0.  The skb is sized by rt6_nlmsg_size() and allocated with
 * gfp_any(); an -EMSGSIZE result from rt6_fill_node() triggers a WARN_ON.
 * The message goes out through rtnl_notify(); rtnl_set_sk_err() reports
 * err to the group on the other visible path.
 */
2699 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2701 struct sk_buff *skb;
2702 struct net *net = info->nl_net;
2707 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2709 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2713 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2714 event, info->pid, seq, 0, 0, 0);
2716 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2717 WARN_ON(err == -EMSGSIZE);
2721 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2722 info->nlh, gfp_any());
2726 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * Netdevice notifier callback.
 *
 * When a loopback device registers, the namespace's special route entries
 * are bound to it: ip6_null_entry and, with CONFIG_IPV6_MULTIPLE_TABLES,
 * the prohibit and blackhole entries.  Each gets dst.dev set to the device
 * and rt6i_idev set from its own in6_dev_get() call.
 */
2729 static int ip6_route_dev_notify(struct notifier_block *this,
2730 unsigned long event, void *data)
2732 struct net_device *dev = (struct net_device *)data;
2733 struct net *net = dev_net(dev);
2735 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2736 net->ipv6.ip6_null_entry->dst.dev = dev;
2737 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2738 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2739 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2740 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2741 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2742 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2753 #ifdef CONFIG_PROC_FS
/*
 * Per-route callback that prints one line of the "ipv6_route" proc file;
 * @p_arg is the seq_file.
 *
 * Columns: destination address and prefix length; source address and
 * prefix length (all zeros without CONFIG_IPV6_SUBTREES); next hop taken
 * from the neighbour's key, or all zeros on the other path; then metric,
 * reference count, use count and flags as 8-digit hex, and the device name
 * (empty string if the route has no device).
 */
2764 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2766 struct seq_file *m = p_arg;
2767 struct neighbour *n;
2769 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2771 #ifdef CONFIG_IPV6_SUBTREES
2772 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2774 seq_puts(m, "00000000000000000000000000000000 00 ");
2779 seq_printf(m, "%pi6", n->primary_key);
2781 seq_puts(m, "00000000000000000000000000000000");
2784 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2785 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2786 rt->dst.__use, rt->rt6i_flags,
2787 rt->dst.dev ? rt->dst.dev->name : "");
/*
 * seq_file show routine for "ipv6_route": runs rt6_info_route() over every
 * route of the namespace stored in m->private via fib6_clean_all_ro().
 */
2791 static int ipv6_route_show(struct seq_file *m, void *v)
2793 struct net *net = (struct net *)m->private;
2794 fib6_clean_all_ro(net, rt6_info_route, 0, m);
/* open() for "ipv6_route": namespace-aware single_open with ipv6_route_show(). */
2798 static int ipv6_route_open(struct inode *inode, struct file *file)
2800 return single_open_net(inode, file, ipv6_route_show);
/* File operations for the "ipv6_route" proc entry created in ip6_route_net_init_late(). */
2803 static const struct file_operations ipv6_route_proc_fops = {
2804 .owner = THIS_MODULE,
2805 .open = ipv6_route_open,
2807 .llseek = seq_lseek,
2808 .release = single_release_net,
/*
 * seq_file show routine for "rt6_stats": prints seven 4-digit hex fields
 * for the namespace in seq->private, in this order: fib_nodes,
 * fib_route_nodes, fib_rt_alloc, fib_rt_entries, fib_rt_cache, the slow
 * dst entry count of ip6_dst_ops, and fib_discarded_routes.
 */
2811 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2813 struct net *net = (struct net *)seq->private;
2814 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2815 net->ipv6.rt6_stats->fib_nodes,
2816 net->ipv6.rt6_stats->fib_route_nodes,
2817 net->ipv6.rt6_stats->fib_rt_alloc,
2818 net->ipv6.rt6_stats->fib_rt_entries,
2819 net->ipv6.rt6_stats->fib_rt_cache,
2820 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2821 net->ipv6.rt6_stats->fib_discarded_routes);
/* open() for "rt6_stats": namespace-aware single_open with rt6_stats_seq_show(). */
2826 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2828 return single_open_net(inode, file, rt6_stats_seq_show);
/* File operations for the "rt6_stats" proc entry created in ip6_route_net_init_late(). */
2831 static const struct file_operations rt6_stats_seq_fops = {
2832 .owner = THIS_MODULE,
2833 .open = rt6_stats_seq_open,
2835 .llseek = seq_lseek,
2836 .release = single_release_net,
2838 #endif /* CONFIG_PROC_FS */
2840 #ifdef CONFIG_SYSCTL
/*
 * Handler for the "flush" sysctl.  The owning namespace is taken from
 * ctl->extra1 (set up in ipv6_route_sysctl_init()).
 *
 * The delay is sampled from flush_delay BEFORE proc_dointvec() processes
 * the user buffer, so fib6_run_gc() is called with the value stored prior
 * to this call.  A delay of zero or less is passed on as ~0UL.
 */
2843 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2844 void __user *buffer, size_t *lenp, loff_t *ppos)
2851 net = (struct net *)ctl->extra1;
2852 delay = net->ipv6.sysctl.flush_delay;
2853 proc_dointvec(ctl, write, buffer, lenp, ppos);
2854 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
/*
 * Template for the IPv6 route sysctl entries.  The .data pointers here
 * reference init_net (and the dst_ops template for gc_thresh);
 * ipv6_route_sysctl_init() duplicates the table for each namespace and
 * repoints .data BY ARRAY INDEX, so the order of entries must stay in step
 * with that function.
 *
 * "gc_min_interval" and "gc_min_interval_ms" are backed by the same
 * variable and differ only in the handler (jiffies vs. ms conversion).
 */
2858 ctl_table ipv6_route_table_template[] = {
2860 .procname = "flush",
2861 .data = &init_net.ipv6.sysctl.flush_delay,
2862 .maxlen = sizeof(int),
2864 .proc_handler = ipv6_sysctl_rtcache_flush
2867 .procname = "gc_thresh",
2868 .data = &ip6_dst_ops_template.gc_thresh,
2869 .maxlen = sizeof(int),
2871 .proc_handler = proc_dointvec,
2874 .procname = "max_size",
2875 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2876 .maxlen = sizeof(int),
2878 .proc_handler = proc_dointvec,
2881 .procname = "gc_min_interval",
2882 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2883 .maxlen = sizeof(int),
2885 .proc_handler = proc_dointvec_jiffies,
2888 .procname = "gc_timeout",
2889 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2890 .maxlen = sizeof(int),
2892 .proc_handler = proc_dointvec_jiffies,
2895 .procname = "gc_interval",
2896 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2897 .maxlen = sizeof(int),
2899 .proc_handler = proc_dointvec_jiffies,
2902 .procname = "gc_elasticity",
2903 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2904 .maxlen = sizeof(int),
2906 .proc_handler = proc_dointvec,
2909 .procname = "mtu_expires",
2910 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2911 .maxlen = sizeof(int),
2913 .proc_handler = proc_dointvec_jiffies,
2916 .procname = "min_adv_mss",
2917 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2918 .maxlen = sizeof(int),
2920 .proc_handler = proc_dointvec,
2923 .procname = "gc_min_interval_ms",
2924 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2925 .maxlen = sizeof(int),
2927 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * Create the per-namespace copy of ipv6_route_table_template.
 *
 * The template is duplicated with kmemdup() and each entry's .data is
 * redirected to the corresponding field of @net.  Entries are addressed by
 * index, so the numbering below mirrors the template order.  Entry 0
 * ("flush") additionally stores @net in extra1 for
 * ipv6_sysctl_rtcache_flush().  Entries 3 and 9 both point at
 * ip6_rt_gc_min_interval.
 */
2932 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2934 struct ctl_table *table;
2936 table = kmemdup(ipv6_route_table_template,
2937 sizeof(ipv6_route_table_template),
2941 table[0].data = &net->ipv6.sysctl.flush_delay;
2942 table[0].extra1 = net;
2943 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2944 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2945 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2946 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2947 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2948 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2949 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2950 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2951 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/*
 * Per-namespace initialisation of IPv6 routing state.
 *
 * Order of work:
 *   1. copy ip6_dst_ops_template into the namespace and initialise its
 *      entry counter;
 *   2. duplicate the null route entry template and, with
 *      CONFIG_IPV6_MULTIPLE_TABLES, the prohibit and blackhole templates;
 *      each copy gets dst.path pointing at itself, dst.ops pointing at the
 *      namespace's dst_ops, and metrics initialised from
 *      ip6_template_metrics;
 *   3. set the sysctl defaults: flush_delay 0, max_size 4096,
 *      gc_min_interval HZ/2, gc_timeout 60*HZ, gc_interval 30*HZ,
 *      gc_elasticity 9, mtu_expires 10*60*HZ,
 *      min_advmss IPV6_MIN_MTU - 20 - 40; ip6_rt_gc_expire starts at 30*HZ.
 *
 * The labels at the end unwind in reverse order of allocation.
 */
2958 static int __net_init ip6_route_net_init(struct net *net)
2962 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2963 sizeof(net->ipv6.ip6_dst_ops));
2965 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2966 goto out_ip6_dst_ops;
2968 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2969 sizeof(*net->ipv6.ip6_null_entry),
2971 if (!net->ipv6.ip6_null_entry)
2972 goto out_ip6_dst_entries;
2973 net->ipv6.ip6_null_entry->dst.path =
2974 (struct dst_entry *)net->ipv6.ip6_null_entry;
2975 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2976 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2977 ip6_template_metrics, true);
2979 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2980 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2981 sizeof(*net->ipv6.ip6_prohibit_entry),
2983 if (!net->ipv6.ip6_prohibit_entry)
2984 goto out_ip6_null_entry;
2985 net->ipv6.ip6_prohibit_entry->dst.path =
2986 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2987 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2988 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2989 ip6_template_metrics, true);
2991 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2992 sizeof(*net->ipv6.ip6_blk_hole_entry),
2994 if (!net->ipv6.ip6_blk_hole_entry)
2995 goto out_ip6_prohibit_entry;
2996 net->ipv6.ip6_blk_hole_entry->dst.path =
2997 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2998 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2999 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3000 ip6_template_metrics, true);
3003 net->ipv6.sysctl.flush_delay = 0;
3004 net->ipv6.sysctl.ip6_rt_max_size = 4096;
3005 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3006 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3007 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3008 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3009 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3010 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3012 net->ipv6.ip6_rt_gc_expire = 30*HZ;
3018 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3019 out_ip6_prohibit_entry:
3020 kfree(net->ipv6.ip6_prohibit_entry);
3022 kfree(net->ipv6.ip6_null_entry);
3024 out_ip6_dst_entries:
3025 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * Per-namespace teardown: frees the template route copies stored in @net
 * (null entry always; prohibit and blk_hole entries under
 * CONFIG_IPV6_MULTIPLE_TABLES) and calls dst_entries_destroy() on the
 * namespace's dst_ops.
 */
3030 static void __net_exit ip6_route_net_exit(struct net *net)
3032 kfree(net->ipv6.ip6_null_entry);
3033 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3034 kfree(net->ipv6.ip6_prohibit_entry);
3035 kfree(net->ipv6.ip6_blk_hole_entry);
3037 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * Late per-namespace init: with CONFIG_PROC_FS, creates the "ipv6_route"
 * and "rt6_stats" proc net entries for @net.
 * NOTE(review): the return values of proc_net_fops_create() are not
 * checked in the visible lines.
 */
3040 static int __net_init ip6_route_net_init_late(struct net *net)
3042 #ifdef CONFIG_PROC_FS
3043 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
3044 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
/*
 * Late per-namespace exit: with CONFIG_PROC_FS, removes the "ipv6_route"
 * and "rt6_stats" proc net entries of @net.
 */
3049 static void __net_exit ip6_route_net_exit_late(struct net *net)
3051 #ifdef CONFIG_PROC_FS
3052 proc_net_remove(net, "ipv6_route");
3053 proc_net_remove(net, "rt6_stats");
/* Pernet hooks pairing ip6_route_net_init() with ip6_route_net_exit(). */
3057 static struct pernet_operations ip6_route_net_ops = {
3058 .init = ip6_route_net_init,
3059 .exit = ip6_route_net_exit,
/*
 * Allocates an inet_peer_base for @net with GFP_KERNEL, initialises it via
 * inet_peer_base_init() and stores it in net->ipv6.peers.
 * NOTE(review): no allocation-failure check or return statement is visible
 * in this listing (lines appear to be elided) — confirm in the full source.
 */
3062 static int __net_init ipv6_inetpeer_init(struct net *net)
3064 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3068 inet_peer_base_init(bp);
3069 net->ipv6.peers = bp;
/*
 * Detaches the inet_peer_base from @net (net->ipv6.peers is cleared first)
 * and hands it to inetpeer_invalidate_tree().
 * NOTE(review): no release of bp itself is visible in this listing; lines
 * appear to be elided after the last statement — confirm in the full source.
 */
3073 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3075 struct inet_peer_base *bp = net->ipv6.peers;
3077 net->ipv6.peers = NULL;
3078 inetpeer_invalidate_tree(bp);
/* Pernet hooks pairing ipv6_inetpeer_init() with ipv6_inetpeer_exit(). */
3082 static struct pernet_operations ipv6_inetpeer_ops = {
3083 .init = ipv6_inetpeer_init,
3084 .exit = ipv6_inetpeer_exit,
/*
 * Pernet hooks pairing ip6_route_net_init_late() with
 * ip6_route_net_exit_late() (the proc entry setup/removal).
 */
3087 static struct pernet_operations ip6_route_net_late_ops = {
3088 .init = ip6_route_net_init_late,
3089 .exit = ip6_route_net_exit_late,
/* Notifier block whose callback is ip6_route_dev_notify(). */
3092 static struct notifier_block ip6_route_dev_notifier = {
3093 .notifier_call = ip6_route_dev_notify,
/*
 * One-time initialisation of the IPv6 routing layer.
 *
 * Visible sequence: create the "ip6_dst_cache" slab cache, call
 * dst_entries_init() on ip6_dst_blackhole_ops, register the inetpeer and
 * route pernet subsystems, attach the loopback device to init_net's
 * template routes, initialise fib6 rules, register the late pernet
 * subsystem, the PF_INET6 route rtnetlink handlers and the netdevice
 * notifier. Failures jump to the labels at the end, which undo the earlier
 * steps in reverse order.
 *
 * NOTE(review): this listing elides several original lines (declarations,
 * some initialisation calls, error checks, a few labels and the return
 * statements); comments describe only the visible statements.
 */
3097 int __init ip6_route_init(void)
/* Slab cache sized for struct rt6_info objects. */
3102 ip6_dst_ops_template.kmem_cachep =
3103 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3104 SLAB_HWCACHE_ALIGN, NULL);
3105 if (!ip6_dst_ops_template.kmem_cachep)
3108 ret = dst_entries_init(&ip6_dst_blackhole_ops);
3110 goto out_kmem_cache;
3112 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3114 goto out_dst_entries;
3116 ret = register_pernet_subsys(&ip6_route_net_ops);
3118 goto out_register_inetpeer;
/* The blackhole dst_ops use the same slab cache as the template. */
3120 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3122 /* Registering of the loopback is done before this portion of code,
3123 * the loopback reference in rt6_info will not be taken, do it
3124 * manually for init_net */
3125 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3126 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3127 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3128 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3129 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3130 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3131 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3135 goto out_register_subsys;
3141 ret = fib6_rules_init();
3145 ret = register_pernet_subsys(&ip6_route_net_late_ops);
3147 goto fib6_rules_init;
/*
 * Route netlink handlers for RTM_NEWROUTE / RTM_DELROUTE / RTM_GETROUTE;
 * any non-zero return from __rtnl_register() aborts initialisation.
 */
3150 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3151 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3152 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3153 goto out_register_late_subsys;
3155 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3157 goto out_register_late_subsys;
/* Error unwinding: undo the steps above in reverse order of setup. */
3162 out_register_late_subsys:
3163 unregister_pernet_subsys(&ip6_route_net_late_ops);
3165 fib6_rules_cleanup();
3170 out_register_subsys:
3171 unregister_pernet_subsys(&ip6_route_net_ops);
3172 out_register_inetpeer:
3173 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3175 dst_entries_destroy(&ip6_dst_blackhole_ops);
3177 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/*
 * Tears down the IPv6 routing layer: unregisters the netdevice notifier and
 * the pernet subsystems, cleans up fib6 rules, calls dst_entries_destroy()
 * on ip6_dst_blackhole_ops and destroys the dst slab cache.
 *
 * NOTE(review): ipv6_inetpeer_ops is unregistered before ip6_route_net_ops
 * here, whereas the error path of ip6_route_init() unregisters them in the
 * opposite order — confirm this ordering is intended. Some lines of the
 * original appear to be elided in this listing.
 */
3181 void ip6_route_cleanup(void)
3183 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3184 unregister_pernet_subsys(&ip6_route_net_late_ops);
3185 fib6_rules_cleanup();
3188 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3189 unregister_pernet_subsys(&ip6_route_net_ops);
3190 dst_entries_destroy(&ip6_dst_blackhole_ops);
3191 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);