ipv6: Move xfrm_lookup() call down into icmp6_dst_alloc().
[firefly-linux-kernel-4.4.55.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
/*
 * Debug tracing level. RDBG() and RT6_TRACE() only produce printk output
 * when RT6_DEBUG is 3 or higher; at lower levels they expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
75
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77                                     const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int      ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void             ip6_dst_destroy(struct dst_entry *);
83 static void             ip6_dst_ifdown(struct dst_entry *,
84                                        struct net_device *dev, int how);
85 static int               ip6_dst_gc(struct dst_ops *ops);
86
87 static int              ip6_pkt_discard(struct sk_buff *skb);
88 static int              ip6_pkt_discard_out(struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
91
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94                                            const struct in6_addr *prefix, int prefixlen,
95                                            const struct in6_addr *gwaddr, int ifindex,
96                                            unsigned pref);
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98                                            const struct in6_addr *prefix, int prefixlen,
99                                            const struct in6_addr *gwaddr, int ifindex);
100 #endif
101
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
103 {
104         struct rt6_info *rt = (struct rt6_info *) dst;
105         struct inet_peer *peer;
106         u32 *p = NULL;
107
108         if (!(rt->dst.flags & DST_HOST))
109                 return NULL;
110
111         if (!rt->rt6i_peer)
112                 rt6_bind_peer(rt, 1);
113
114         peer = rt->rt6i_peer;
115         if (peer) {
116                 u32 *old_p = __DST_METRICS_PTR(old);
117                 unsigned long prev, new;
118
119                 p = peer->metrics;
120                 if (inet_metrics_new(peer))
121                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
122
123                 new = (unsigned long) p;
124                 prev = cmpxchg(&dst->_metrics, old, new);
125
126                 if (prev != old) {
127                         p = __DST_METRICS_PTR(prev);
128                         if (prev & DST_METRICS_READ_ONLY)
129                                 p = NULL;
130                 }
131         }
132         return p;
133 }
134
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
136 {
137         return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
138 }
139
140 static struct dst_ops ip6_dst_ops_template = {
141         .family                 =       AF_INET6,
142         .protocol               =       cpu_to_be16(ETH_P_IPV6),
143         .gc                     =       ip6_dst_gc,
144         .gc_thresh              =       1024,
145         .check                  =       ip6_dst_check,
146         .default_advmss         =       ip6_default_advmss,
147         .mtu                    =       ip6_mtu,
148         .cow_metrics            =       ipv6_cow_metrics,
149         .destroy                =       ip6_dst_destroy,
150         .ifdown                 =       ip6_dst_ifdown,
151         .negative_advice        =       ip6_negative_advice,
152         .link_failure           =       ip6_link_failure,
153         .update_pmtu            =       ip6_rt_update_pmtu,
154         .local_out              =       __ip6_local_out,
155         .neigh_lookup           =       ip6_neigh_lookup,
156 };
157
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
159 {
160         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
161
162         return mtu ? : dst->dev->mtu;
163 }
164
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
166 {
167 }
168
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
170                                          unsigned long old)
171 {
172         return NULL;
173 }
174
175 static struct dst_ops ip6_dst_blackhole_ops = {
176         .family                 =       AF_INET6,
177         .protocol               =       cpu_to_be16(ETH_P_IPV6),
178         .destroy                =       ip6_dst_destroy,
179         .check                  =       ip6_dst_check,
180         .mtu                    =       ip6_blackhole_mtu,
181         .default_advmss         =       ip6_default_advmss,
182         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
183         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
184         .neigh_lookup           =       ip6_neigh_lookup,
185 };
186
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188         [RTAX_HOPLIMIT - 1] = 255,
189 };
190
191 static struct rt6_info ip6_null_entry_template = {
192         .dst = {
193                 .__refcnt       = ATOMIC_INIT(1),
194                 .__use          = 1,
195                 .obsolete       = -1,
196                 .error          = -ENETUNREACH,
197                 .input          = ip6_pkt_discard,
198                 .output         = ip6_pkt_discard_out,
199         },
200         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
201         .rt6i_protocol  = RTPROT_KERNEL,
202         .rt6i_metric    = ~(u32) 0,
203         .rt6i_ref       = ATOMIC_INIT(1),
204 };
205
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/* Template reject route reporting -EACCES through the prohibit handlers. */
static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt = ATOMIC_INIT(1),
		.__use = 1,
		.obsolete = -1,
		.error = -EACCES,
		.input = ip6_pkt_prohibit,
		.output = ip6_pkt_prohibit_out,
	},
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_metric = ~(u32) 0,
	.rt6i_ref = ATOMIC_INIT(1),
};

/* Template reject route reporting -EINVAL; both directions use dst_discard. */
static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt = ATOMIC_INIT(1),
		.__use = 1,
		.obsolete = -1,
		.error = -EINVAL,
		.input = dst_discard,
		.output = dst_discard,
	},
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_metric = ~(u32) 0,
	.rt6i_ref = ATOMIC_INIT(1),
};

#endif
242
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245                                              struct net_device *dev,
246                                              int flags)
247 {
248         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
249
250         if (rt)
251                 memset(&rt->rt6i_table, 0,
252                        sizeof(*rt) - sizeof(struct dst_entry));
253
254         return rt;
255 }
256
257 static void ip6_dst_destroy(struct dst_entry *dst)
258 {
259         struct rt6_info *rt = (struct rt6_info *)dst;
260         struct inet6_dev *idev = rt->rt6i_idev;
261         struct inet_peer *peer = rt->rt6i_peer;
262
263         if (!(rt->dst.flags & DST_HOST))
264                 dst_destroy_metrics_generic(dst);
265
266         if (idev) {
267                 rt->rt6i_idev = NULL;
268                 in6_dev_put(idev);
269         }
270         if (peer) {
271                 rt->rt6i_peer = NULL;
272                 inet_putpeer(peer);
273         }
274 }
275
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
277
278 static u32 rt6_peer_genid(void)
279 {
280         return atomic_read(&__rt6_peer_genid);
281 }
282
283 void rt6_bind_peer(struct rt6_info *rt, int create)
284 {
285         struct inet_peer *peer;
286
287         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
289                 inet_putpeer(peer);
290         else
291                 rt->rt6i_peer_genid = rt6_peer_genid();
292 }
293
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
295                            int how)
296 {
297         struct rt6_info *rt = (struct rt6_info *)dst;
298         struct inet6_dev *idev = rt->rt6i_idev;
299         struct net_device *loopback_dev =
300                 dev_net(dev)->loopback_dev;
301
302         if (dev != loopback_dev && idev && idev->dev == dev) {
303                 struct inet6_dev *loopback_idev =
304                         in6_dev_get(loopback_dev);
305                 if (loopback_idev) {
306                         rt->rt6i_idev = loopback_idev;
307                         in6_dev_put(idev);
308                 }
309         }
310 }
311
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
313 {
314         return (rt->rt6i_flags & RTF_EXPIRES) &&
315                 time_after(jiffies, rt->rt6i_expires);
316 }
317
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
319 {
320         return ipv6_addr_type(daddr) &
321                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
322 }
323
324 /*
325  *      Route lookup. Any table->tb6_lock is implied.
326  */
327
328 static inline struct rt6_info *rt6_device_match(struct net *net,
329                                                     struct rt6_info *rt,
330                                                     const struct in6_addr *saddr,
331                                                     int oif,
332                                                     int flags)
333 {
334         struct rt6_info *local = NULL;
335         struct rt6_info *sprt;
336
337         if (!oif && ipv6_addr_any(saddr))
338                 goto out;
339
340         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341                 struct net_device *dev = sprt->rt6i_dev;
342
343                 if (oif) {
344                         if (dev->ifindex == oif)
345                                 return sprt;
346                         if (dev->flags & IFF_LOOPBACK) {
347                                 if (!sprt->rt6i_idev ||
348                                     sprt->rt6i_idev->dev->ifindex != oif) {
349                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
350                                                 continue;
351                                         if (local && (!oif ||
352                                                       local->rt6i_idev->dev->ifindex == oif))
353                                                 continue;
354                                 }
355                                 local = sprt;
356                         }
357                 } else {
358                         if (ipv6_chk_addr(net, saddr, dev,
359                                           flags & RT6_LOOKUP_F_IFACE))
360                                 return sprt;
361                 }
362         }
363
364         if (oif) {
365                 if (local)
366                         return local;
367
368                 if (flags & RT6_LOOKUP_F_IFACE)
369                         return net->ipv6.ip6_null_entry;
370         }
371 out:
372         return rt;
373 }
374
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a neighbour solicitation to the
 * route's neighbour when that neighbour is not in a NUD_VALID state and
 * the last update is older than the device's rtr_probe_interval.
 *
 * @rt: route to probe; NULL is accepted and ignored.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute. The rate limit relies on neigh->updated, which this function
 * rewrites. The check and the store are therefore done under the write
 * side of neigh->lock: with only the read side held, several CPUs could
 * pass the check together, each store the timestamp and each send a probe.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		write_unlock_bh(&neigh->lock);

		/* The solicitation itself is sent with the lock dropped. */
		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else {
		write_unlock_bh(&neigh->lock);
	}
out:
	rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
414
415 /*
416  * Default Router Selection (RFC 2461 6.3.6)
417  */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
419 {
420         struct net_device *dev = rt->rt6i_dev;
421         if (!oif || dev->ifindex == oif)
422                 return 2;
423         if ((dev->flags & IFF_LOOPBACK) &&
424             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
425                 return 1;
426         return 0;
427 }
428
429 static inline int rt6_check_neigh(struct rt6_info *rt)
430 {
431         struct neighbour *neigh;
432         int m;
433
434         rcu_read_lock();
435         neigh = dst_get_neighbour_noref(&rt->dst);
436         if (rt->rt6i_flags & RTF_NONEXTHOP ||
437             !(rt->rt6i_flags & RTF_GATEWAY))
438                 m = 1;
439         else if (neigh) {
440                 read_lock_bh(&neigh->lock);
441                 if (neigh->nud_state & NUD_VALID)
442                         m = 2;
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444                 else if (neigh->nud_state & NUD_FAILED)
445                         m = 0;
446 #endif
447                 else
448                         m = 1;
449                 read_unlock_bh(&neigh->lock);
450         } else
451                 m = 0;
452         rcu_read_unlock();
453         return m;
454 }
455
456 static int rt6_score_route(struct rt6_info *rt, int oif,
457                            int strict)
458 {
459         int m, n;
460
461         m = rt6_check_dev(rt, oif);
462         if (!m && (strict & RT6_LOOKUP_F_IFACE))
463                 return -1;
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
466 #endif
467         n = rt6_check_neigh(rt);
468         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
469                 return -1;
470         return m;
471 }
472
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474                                    int *mpri, struct rt6_info *match)
475 {
476         int m;
477
478         if (rt6_check_expired(rt))
479                 goto out;
480
481         m = rt6_score_route(rt, oif, strict);
482         if (m < 0)
483                 goto out;
484
485         if (m > *mpri) {
486                 if (strict & RT6_LOOKUP_F_REACHABLE)
487                         rt6_probe(match);
488                 *mpri = m;
489                 match = rt;
490         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
491                 rt6_probe(rt);
492         }
493
494 out:
495         return match;
496 }
497
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499                                      struct rt6_info *rr_head,
500                                      u32 metric, int oif, int strict)
501 {
502         struct rt6_info *rt, *match;
503         int mpri = -1;
504
505         match = NULL;
506         for (rt = rr_head; rt && rt->rt6i_metric == metric;
507              rt = rt->dst.rt6_next)
508                 match = find_match(rt, oif, strict, &mpri, match);
509         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510              rt = rt->dst.rt6_next)
511                 match = find_match(rt, oif, strict, &mpri, match);
512
513         return match;
514 }
515
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
517 {
518         struct rt6_info *match, *rt0;
519         struct net *net;
520
521         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522                   __func__, fn->leaf, oif);
523
524         rt0 = fn->rr_ptr;
525         if (!rt0)
526                 fn->rr_ptr = rt0 = fn->leaf;
527
528         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
529
530         if (!match &&
531             (strict & RT6_LOOKUP_F_REACHABLE)) {
532                 struct rt6_info *next = rt0->dst.rt6_next;
533
534                 /* no entries matched; do round-robin */
535                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
536                         next = fn->leaf;
537
538                 if (next != rt0)
539                         fn->rr_ptr = next;
540         }
541
542         RT6_TRACE("%s() => %p\n",
543                   __func__, match);
544
545         net = dev_net(rt0->rt6i_dev);
546         return match ? match : net->ipv6.ip6_null_entry;
547 }
548
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received on @dev.
 *
 * @opt/@len: the option and its length in bytes
 * @gwaddr:   address of the advertising router
 *
 * Returns -EINVAL when the option is shorter than struct route_info, its
 * length field is inconsistent with the prefix length, or the preference
 * is ICMPV6_ROUTER_PREF_INVALID; 0 otherwise.
 *
 * An existing route is deleted when the lifetime is zero, created when
 * missing and the lifetime is non-zero, and otherwise has its preference
 * bits refreshed. The expiry is then set from the lifetime, or cleared
 * for a non-finite lifetime.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr masked_prefix, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;

	if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2)
			return -EINVAL;
	} else if (rinfo->prefix_len > 0 && rinfo->length < 1) {
		return -EINVAL;
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&masked_prefix,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &masked_prefix;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	} else if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
622
/*
 * Climb the fib6 tree when the current lookup produced the null entry.
 *
 * Expects the expanding function to provide the variables 'rt' and 'fn'
 * and the labels 'out' and 'restart'. Reaching a node flagged RTN_TL_ROOT
 * jumps to 'out'. Otherwise 'fn' moves to the parent, or to the result
 * of a lookup of saddr in the parent's subtree when that subtree exists
 * and is not the node we came from. Once 'fn' carries RTN_RTINFO the
 * macro jumps to 'restart'.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == __net->ipv6.ip6_null_entry) { \
		struct fib6_node *pn; \
		for (;;) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
640
641 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
642                                              struct fib6_table *table,
643                                              struct flowi6 *fl6, int flags)
644 {
645         struct fib6_node *fn;
646         struct rt6_info *rt;
647
648         read_lock_bh(&table->tb6_lock);
649         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
650 restart:
651         rt = fn->leaf;
652         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
653         BACKTRACK(net, &fl6->saddr);
654 out:
655         dst_use(&rt->dst, jiffies);
656         read_unlock_bh(&table->tb6_lock);
657         return rt;
658
659 }
660
661 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
662                             const struct in6_addr *saddr, int oif, int strict)
663 {
664         struct flowi6 fl6 = {
665                 .flowi6_oif = oif,
666                 .daddr = *daddr,
667         };
668         struct dst_entry *dst;
669         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
670
671         if (saddr) {
672                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
673                 flags |= RT6_LOOKUP_F_HAS_SADDR;
674         }
675
676         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
677         if (dst->error == 0)
678                 return (struct rt6_info *) dst;
679
680         dst_release(dst);
681
682         return NULL;
683 }
684
685 EXPORT_SYMBOL(rt6_lookup);
686
687 /* ip6_ins_rt is called with FREE table->tb6_lock.
688    It takes new route entry, the addition fails by any reason the
689    route is freed. In any case, if caller does not hold it, it may
690    be destroyed.
691  */
692
693 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
694 {
695         int err;
696         struct fib6_table *table;
697
698         table = rt->rt6i_table;
699         write_lock_bh(&table->tb6_lock);
700         err = fib6_add(&table->tb6_root, rt, info);
701         write_unlock_bh(&table->tb6_lock);
702
703         return err;
704 }
705
706 int ip6_ins_rt(struct rt6_info *rt)
707 {
708         struct nl_info info = {
709                 .nl_net = dev_net(rt->rt6i_dev),
710         };
711         return __ip6_ins_rt(rt, &info);
712 }
713
714 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
715                                       const struct in6_addr *daddr,
716                                       const struct in6_addr *saddr)
717 {
718         struct rt6_info *rt;
719
720         /*
721          *      Clone the route.
722          */
723
724         rt = ip6_rt_copy(ort, daddr);
725
726         if (rt) {
727                 struct neighbour *neigh;
728                 int attempts = !in_softirq();
729
730                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
731                         if (rt->rt6i_dst.plen != 128 &&
732                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
733                                 rt->rt6i_flags |= RTF_ANYCAST;
734                         rt->rt6i_gateway = *daddr;
735                 }
736
737                 rt->rt6i_flags |= RTF_CACHE;
738
739 #ifdef CONFIG_IPV6_SUBTREES
740                 if (rt->rt6i_src.plen && saddr) {
741                         rt->rt6i_src.addr = *saddr;
742                         rt->rt6i_src.plen = 128;
743                 }
744 #endif
745
746         retry:
747                 neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway,
748                                              rt->rt6i_dev);
749                 if (IS_ERR(neigh)) {
750                         struct net *net = dev_net(rt->rt6i_dev);
751                         int saved_rt_min_interval =
752                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
753                         int saved_rt_elasticity =
754                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
755
756                         if (attempts-- > 0) {
757                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
758                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
759
760                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
761
762                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
763                                         saved_rt_elasticity;
764                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
765                                         saved_rt_min_interval;
766                                 goto retry;
767                         }
768
769                         if (net_ratelimit())
770                                 printk(KERN_WARNING
771                                        "ipv6: Neighbour table overflow.\n");
772                         dst_free(&rt->dst);
773                         return NULL;
774                 }
775                 dst_set_neighbour(&rt->dst, neigh);
776
777         }
778
779         return rt;
780 }
781
782 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
783                                         const struct in6_addr *daddr)
784 {
785         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
786
787         if (rt) {
788                 rt->rt6i_flags |= RTF_CACHE;
789                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
790         }
791         return rt;
792 }
793
794 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
795                                       struct flowi6 *fl6, int flags)
796 {
797         struct fib6_node *fn;
798         struct rt6_info *rt, *nrt;
799         int strict = 0;
800         int attempts = 3;
801         int err;
802         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
803
804         strict |= flags & RT6_LOOKUP_F_IFACE;
805
806 relookup:
807         read_lock_bh(&table->tb6_lock);
808
809 restart_2:
810         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
811
812 restart:
813         rt = rt6_select(fn, oif, strict | reachable);
814
815         BACKTRACK(net, &fl6->saddr);
816         if (rt == net->ipv6.ip6_null_entry ||
817             rt->rt6i_flags & RTF_CACHE)
818                 goto out;
819
820         dst_hold(&rt->dst);
821         read_unlock_bh(&table->tb6_lock);
822
823         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
824                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
825         else if (!(rt->dst.flags & DST_HOST))
826                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
827         else
828                 goto out2;
829
830         dst_release(&rt->dst);
831         rt = nrt ? : net->ipv6.ip6_null_entry;
832
833         dst_hold(&rt->dst);
834         if (nrt) {
835                 err = ip6_ins_rt(nrt);
836                 if (!err)
837                         goto out2;
838         }
839
840         if (--attempts <= 0)
841                 goto out2;
842
843         /*
844          * Race condition! In the gap, when table->tb6_lock was
845          * released someone could insert this route.  Relookup.
846          */
847         dst_release(&rt->dst);
848         goto relookup;
849
850 out:
851         if (reachable) {
852                 reachable = 0;
853                 goto restart_2;
854         }
855         dst_hold(&rt->dst);
856         read_unlock_bh(&table->tb6_lock);
857 out2:
858         rt->dst.lastuse = jiffies;
859         rt->dst.__use++;
860
861         return rt;
862 }
863
864 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
865                                             struct flowi6 *fl6, int flags)
866 {
867         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
868 }
869
870 void ip6_route_input(struct sk_buff *skb)
871 {
872         const struct ipv6hdr *iph = ipv6_hdr(skb);
873         struct net *net = dev_net(skb->dev);
874         int flags = RT6_LOOKUP_F_HAS_SADDR;
875         struct flowi6 fl6 = {
876                 .flowi6_iif = skb->dev->ifindex,
877                 .daddr = iph->daddr,
878                 .saddr = iph->saddr,
879                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
880                 .flowi6_mark = skb->mark,
881                 .flowi6_proto = iph->nexthdr,
882         };
883
884         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
885                 flags |= RT6_LOOKUP_F_IFACE;
886
887         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
888 }
889
890 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
891                                              struct flowi6 *fl6, int flags)
892 {
893         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
894 }
895
896 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
897                                     struct flowi6 *fl6)
898 {
899         int flags = 0;
900
901         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
902                 flags |= RT6_LOOKUP_F_IFACE;
903
904         if (!ipv6_addr_any(&fl6->saddr))
905                 flags |= RT6_LOOKUP_F_HAS_SADDR;
906         else if (sk)
907                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
908
909         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
910 }
911
912 EXPORT_SYMBOL(ip6_route_output);
913
914 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
915 {
916         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
917         struct dst_entry *new = NULL;
918
919         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
920         if (rt) {
921                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
922
923                 new = &rt->dst;
924
925                 new->__use = 1;
926                 new->input = dst_discard;
927                 new->output = dst_discard;
928
929                 if (dst_metrics_read_only(&ort->dst))
930                         new->_metrics = ort->dst._metrics;
931                 else
932                         dst_copy_metrics(new, &ort->dst);
933                 rt->rt6i_idev = ort->rt6i_idev;
934                 if (rt->rt6i_idev)
935                         in6_dev_hold(rt->rt6i_idev);
936                 rt->rt6i_expires = 0;
937
938                 rt->rt6i_gateway = ort->rt6i_gateway;
939                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
940                 rt->rt6i_metric = 0;
941
942                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
943 #ifdef CONFIG_IPV6_SUBTREES
944                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
945 #endif
946
947                 dst_free(new);
948         }
949
950         dst_release(dst_orig);
951         return new ? new : ERR_PTR(-ENOMEM);
952 }
953
954 /*
955  *      Destination cache support functions
956  */
957
958 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
959 {
960         struct rt6_info *rt;
961
962         rt = (struct rt6_info *) dst;
963
964         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
965                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
966                         if (!rt->rt6i_peer)
967                                 rt6_bind_peer(rt, 0);
968                         rt->rt6i_peer_genid = rt6_peer_genid();
969                 }
970                 return dst;
971         }
972         return NULL;
973 }
974
975 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
976 {
977         struct rt6_info *rt = (struct rt6_info *) dst;
978
979         if (rt) {
980                 if (rt->rt6i_flags & RTF_CACHE) {
981                         if (rt6_check_expired(rt)) {
982                                 ip6_del_rt(rt);
983                                 dst = NULL;
984                         }
985                 } else {
986                         dst_release(dst);
987                         dst = NULL;
988                 }
989         }
990         return dst;
991 }
992
993 static void ip6_link_failure(struct sk_buff *skb)
994 {
995         struct rt6_info *rt;
996
997         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
998
999         rt = (struct rt6_info *) skb_dst(skb);
1000         if (rt) {
1001                 if (rt->rt6i_flags & RTF_CACHE) {
1002                         dst_set_expires(&rt->dst, 0);
1003                         rt->rt6i_flags |= RTF_EXPIRES;
1004                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1005                         rt->rt6i_node->fn_sernum = -1;
1006         }
1007 }
1008
1009 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1010 {
1011         struct rt6_info *rt6 = (struct rt6_info*)dst;
1012
1013         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1014                 rt6->rt6i_flags |= RTF_MODIFIED;
1015                 if (mtu < IPV6_MIN_MTU) {
1016                         u32 features = dst_metric(dst, RTAX_FEATURES);
1017                         mtu = IPV6_MIN_MTU;
1018                         features |= RTAX_FEATURE_ALLFRAG;
1019                         dst_metric_set(dst, RTAX_FEATURES, features);
1020                 }
1021                 dst_metric_set(dst, RTAX_MTU, mtu);
1022         }
1023 }
1024
1025 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1026 {
1027         struct net_device *dev = dst->dev;
1028         unsigned int mtu = dst_mtu(dst);
1029         struct net *net = dev_net(dev);
1030
1031         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1032
1033         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1034                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1035
1036         /*
1037          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1038          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1039          * IPV6_MAXPLEN is also valid and means: "any MSS,
1040          * rely only on pmtu discovery"
1041          */
1042         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1043                 mtu = IPV6_MAXPLEN;
1044         return mtu;
1045 }
1046
1047 static unsigned int ip6_mtu(const struct dst_entry *dst)
1048 {
1049         struct inet6_dev *idev;
1050         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1051
1052         if (mtu)
1053                 return mtu;
1054
1055         mtu = IPV6_MIN_MTU;
1056
1057         rcu_read_lock();
1058         idev = __in6_dev_get(dst->dev);
1059         if (idev)
1060                 mtu = idev->cnf.mtu6;
1061         rcu_read_unlock();
1062
1063         return mtu;
1064 }
1065
1066 static struct dst_entry *icmp6_dst_gc_list;
1067 static DEFINE_SPINLOCK(icmp6_dst_lock);
1068
1069 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1070                                   struct neighbour *neigh,
1071                                   struct flowi6 *fl6)
1072 {
1073         struct dst_entry *dst;
1074         struct rt6_info *rt;
1075         struct inet6_dev *idev = in6_dev_get(dev);
1076         struct net *net = dev_net(dev);
1077
1078         if (unlikely(!idev))
1079                 return NULL;
1080
1081         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1082         if (unlikely(!rt)) {
1083                 in6_dev_put(idev);
1084                 dst = ERR_PTR(-ENOMEM);
1085                 goto out;
1086         }
1087
1088         if (neigh)
1089                 neigh_hold(neigh);
1090         else {
1091                 neigh = __neigh_lookup_errno(&nd_tbl, &fl6->daddr, dev);
1092                 if (IS_ERR(neigh))
1093                         neigh = NULL;
1094         }
1095
1096         rt->dst.flags |= DST_HOST;
1097         rt->dst.output  = ip6_output;
1098         dst_set_neighbour(&rt->dst, neigh);
1099         atomic_set(&rt->dst.__refcnt, 1);
1100         rt->rt6i_dst.addr = fl6->daddr;
1101         rt->rt6i_dst.plen = 128;
1102         rt->rt6i_idev     = idev;
1103         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1104
1105         spin_lock_bh(&icmp6_dst_lock);
1106         rt->dst.next = icmp6_dst_gc_list;
1107         icmp6_dst_gc_list = &rt->dst;
1108         spin_unlock_bh(&icmp6_dst_lock);
1109
1110         fib6_force_start_gc(net);
1111
1112         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1113
1114 out:
1115         return dst;
1116 }
1117
1118 int icmp6_dst_gc(void)
1119 {
1120         struct dst_entry *dst, **pprev;
1121         int more = 0;
1122
1123         spin_lock_bh(&icmp6_dst_lock);
1124         pprev = &icmp6_dst_gc_list;
1125
1126         while ((dst = *pprev) != NULL) {
1127                 if (!atomic_read(&dst->__refcnt)) {
1128                         *pprev = dst->next;
1129                         dst_free(dst);
1130                 } else {
1131                         pprev = &dst->next;
1132                         ++more;
1133                 }
1134         }
1135
1136         spin_unlock_bh(&icmp6_dst_lock);
1137
1138         return more;
1139 }
1140
1141 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1142                             void *arg)
1143 {
1144         struct dst_entry *dst, **pprev;
1145
1146         spin_lock_bh(&icmp6_dst_lock);
1147         pprev = &icmp6_dst_gc_list;
1148         while ((dst = *pprev) != NULL) {
1149                 struct rt6_info *rt = (struct rt6_info *) dst;
1150                 if (func(rt, arg)) {
1151                         *pprev = dst->next;
1152                         dst_free(dst);
1153                 } else {
1154                         pprev = &dst->next;
1155                 }
1156         }
1157         spin_unlock_bh(&icmp6_dst_lock);
1158 }
1159
1160 static int ip6_dst_gc(struct dst_ops *ops)
1161 {
1162         unsigned long now = jiffies;
1163         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1164         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1165         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1166         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1167         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1168         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1169         int entries;
1170
1171         entries = dst_entries_get_fast(ops);
1172         if (time_after(rt_last_gc + rt_min_interval, now) &&
1173             entries <= rt_max_size)
1174                 goto out;
1175
1176         net->ipv6.ip6_rt_gc_expire++;
1177         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1178         net->ipv6.ip6_rt_last_gc = now;
1179         entries = dst_entries_get_slow(ops);
1180         if (entries < ops->gc_thresh)
1181                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1182 out:
1183         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1184         return entries > rt_max_size;
1185 }
1186
1187 /* Clean host part of a prefix. Not necessary in radix tree,
1188    but results in cleaner routing tables.
1189
1190    Remove it only when all the things will work!
1191  */
1192
1193 int ip6_dst_hoplimit(struct dst_entry *dst)
1194 {
1195         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1196         if (hoplimit == 0) {
1197                 struct net_device *dev = dst->dev;
1198                 struct inet6_dev *idev;
1199
1200                 rcu_read_lock();
1201                 idev = __in6_dev_get(dev);
1202                 if (idev)
1203                         hoplimit = idev->cnf.hop_limit;
1204                 else
1205                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1206                 rcu_read_unlock();
1207         }
1208         return hoplimit;
1209 }
1210 EXPORT_SYMBOL(ip6_dst_hoplimit);
1211
1212 /*
1213  *
1214  */
1215
/*
 * Create a route described by @cfg and insert it into its FIB table.
 *
 * Steps, in order: validate prefix lengths; take references on the
 * requested device (if fc_ifindex is set); default the metric; find or
 * create the table; allocate and fill the rt6_info; turn reject routes and
 * non-local routes via loopback into reject routes on the loopback device;
 * validate the gateway; validate the preferred source; bind a neighbour
 * for gateway/NONEXTHOP routes; apply netlink-supplied metrics; insert.
 *
 * May modify @cfg: fc_metric and fc_protocol are defaulted, and
 * fc_nlinfo.nl_net is set to the namespace of the final device.
 *
 * Returns the result of __ip6_ins_rt() on the success path.  On any
 * earlier failure returns a negative errno after dropping the device and
 * inet6-device references taken so far and freeing the route.
 */
1216 int ip6_route_add(struct fib6_config *cfg)
1217 {
1218         int err;
1219         struct net *net = cfg->fc_nlinfo.nl_net;
1220         struct rt6_info *rt = NULL;
1221         struct net_device *dev = NULL;
1222         struct inet6_dev *idev = NULL;
1223         struct fib6_table *table;
1224         int addr_type;
1225
1226         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1227                 return -EINVAL;
1228 #ifndef CONFIG_IPV6_SUBTREES
             /* Source-prefix routes need subtree support. */
1229         if (cfg->fc_src_len)
1230                 return -EINVAL;
1231 #endif
1232         if (cfg->fc_ifindex) {
1233                 err = -ENODEV;
1234                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1235                 if (!dev)
1236                         goto out;
1237                 idev = in6_dev_get(dev);
1238                 if (!idev)
1239                         goto out;
1240         }
1241
1242         if (cfg->fc_metric == 0)
1243                 cfg->fc_metric = IP6_RT_PRIO_USER;
1244
1245         err = -ENOBUFS;
             /*
              * A netlink request without NLM_F_CREATE first looks for an
              * existing table; if none exists a warning is logged and the
              * table is created anyway.
              */
1246         if (cfg->fc_nlinfo.nlh &&
1247             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1248                 table = fib6_get_table(net, cfg->fc_table);
1249                 if (!table) {
1250                         printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1251                         table = fib6_new_table(net, cfg->fc_table);
1252                 }
1253         } else {
1254                 table = fib6_new_table(net, cfg->fc_table);
1255         }
1256
1257         if (!table)
1258                 goto out;
1259
1260         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1261
1262         if (!rt) {
1263                 err = -ENOMEM;
1264                 goto out;
1265         }
1266
1267         rt->dst.obsolete = -1;
1268         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1269                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1270                                 0;
1271
1272         if (cfg->fc_protocol == RTPROT_UNSPEC)
1273                 cfg->fc_protocol = RTPROT_BOOT;
1274         rt->rt6i_protocol = cfg->fc_protocol;
1275
1276         addr_type = ipv6_addr_type(&cfg->fc_dst);
1277
             /* Pick the input handler from the destination type and flags. */
1278         if (addr_type & IPV6_ADDR_MULTICAST)
1279                 rt->dst.input = ip6_mc_input;
1280         else if (cfg->fc_flags & RTF_LOCAL)
1281                 rt->dst.input = ip6_input;
1282         else
1283                 rt->dst.input = ip6_forward;
1284
1285         rt->dst.output = ip6_output;
1286
1287         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1288         rt->rt6i_dst.plen = cfg->fc_dst_len;
1289         if (rt->rt6i_dst.plen == 128)
1290                rt->dst.flags |= DST_HOST;
1291
             /* Non-host routes with user metrics get a private, zeroed metrics array. */
1292         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1293                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1294                 if (!metrics) {
1295                         err = -ENOMEM;
1296                         goto out;
1297                 }
1298                 dst_init_metrics(&rt->dst, metrics, 0);
1299         }
1300 #ifdef CONFIG_IPV6_SUBTREES
1301         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1302         rt->rt6i_src.plen = cfg->fc_src_len;
1303 #endif
1304
1305         rt->rt6i_metric = cfg->fc_metric;
1306
1307         /* We cannot add true routes via loopback here,
1308            they would result in kernel looping; promote them to reject routes
1309          */
1310         if ((cfg->fc_flags & RTF_REJECT) ||
1311             (dev && (dev->flags & IFF_LOOPBACK) &&
1312              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1313              !(cfg->fc_flags & RTF_LOCAL))) {
1314                 /* hold loopback dev/idev if we haven't done so. */
1315                 if (dev != net->loopback_dev) {
1316                         if (dev) {
1317                                 dev_put(dev);
1318                                 in6_dev_put(idev);
1319                         }
1320                         dev = net->loopback_dev;
1321                         dev_hold(dev);
1322                         idev = in6_dev_get(dev);
1323                         if (!idev) {
1324                                 err = -ENODEV;
1325                                 goto out;
1326                         }
1327                 }
1328                 rt->dst.output = ip6_pkt_discard_out;
1329                 rt->dst.input = ip6_pkt_discard;
1330                 rt->dst.error = -ENETUNREACH;
1331                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1332                 goto install_route;
1333         }
1334
1335         if (cfg->fc_flags & RTF_GATEWAY) {
1336                 const struct in6_addr *gw_addr;
1337                 int gwa_type;
1338
1339                 gw_addr = &cfg->fc_gateway;
1340                 rt->rt6i_gateway = *gw_addr;
1341                 gwa_type = ipv6_addr_type(gw_addr);
1342
1343                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1344                         struct rt6_info *grt;
1345
1346                         /* IPv6 strictly inhibits using not link-local
1347                            addresses as nexthop address.
1348                            Otherwise, router will not able to send redirects.
1349                            It is very good, but in some (rare!) circumstances
1350                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1351                            some exceptions. --ANK
1352                          */
1353                         err = -EINVAL;
1354                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1355                                 goto out;
1356
1357                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1358
1359                         err = -EHOSTUNREACH;
1360                         if (!grt)
1361                                 goto out;
                             /*
                              * The route to the gateway must use the requested
                              * device; with no device requested, adopt (and
                              * take references on) the gateway route's device.
                              */
1362                         if (dev) {
1363                                 if (dev != grt->rt6i_dev) {
1364                                         dst_release(&grt->dst);
1365                                         goto out;
1366                                 }
1367                         } else {
1368                                 dev = grt->rt6i_dev;
1369                                 idev = grt->rt6i_idev;
1370                                 dev_hold(dev);
1371                                 in6_dev_hold(grt->rt6i_idev);
1372                         }
                             /* Accept only when the gateway route is not itself gatewayed. */
1373                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1374                                 err = 0;
1375                         dst_release(&grt->dst);
1376
1377                         if (err)
1378                                 goto out;
1379                 }
1380                 err = -EINVAL;
1381                 if (!dev || (dev->flags & IFF_LOOPBACK))
1382                         goto out;
1383         }
1384
1385         err = -ENODEV;
1386         if (!dev)
1387                 goto out;
1388
             /* A preferred source address is only accepted if ipv6_chk_addr() passes for dev. */
1389         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1390                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1391                         err = -EINVAL;
1392                         goto out;
1393                 }
1394                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1395                 rt->rt6i_prefsrc.plen = 128;
1396         } else
1397                 rt->rt6i_prefsrc.plen = 0;
1398
1399         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1400                 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1401                 if (IS_ERR(n)) {
1402                         err = PTR_ERR(n);
1403                         goto out;
1404                 }
1405                 dst_set_neighbour(&rt->dst, n);
1406         }
1407
1408         rt->rt6i_flags = cfg->fc_flags;
1409
1410 install_route:
             /* Copy metrics from the netlink attributes; type 0 is skipped, types above RTAX_MAX are rejected. */
1411         if (cfg->fc_mx) {
1412                 struct nlattr *nla;
1413                 int remaining;
1414
1415                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1416                         int type = nla_type(nla);
1417
1418                         if (type) {
1419                                 if (type > RTAX_MAX) {
1420                                         err = -EINVAL;
1421                                         goto out;
1422                                 }
1423
1424                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1425                         }
1426                 }
1427         }
1428
             /* The dev and idev references taken above are stored in the route here. */
1429         rt->dst.dev = dev;
1430         rt->rt6i_idev = idev;
1431         rt->rt6i_table = table;
1432
1433         cfg->fc_nlinfo.nl_net = dev_net(dev);
1434
1435         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1436
1437 out:
1438         if (dev)
1439                 dev_put(dev);
1440         if (idev)
1441                 in6_dev_put(idev);
1442         if (rt)
1443                 dst_free(&rt->dst);
1444         return err;
1445 }
1446
1447 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1448 {
1449         int err;
1450         struct fib6_table *table;
1451         struct net *net = dev_net(rt->rt6i_dev);
1452
1453         if (rt == net->ipv6.ip6_null_entry)
1454                 return -ENOENT;
1455
1456         table = rt->rt6i_table;
1457         write_lock_bh(&table->tb6_lock);
1458
1459         err = fib6_del(rt, info);
1460         dst_release(&rt->dst);
1461
1462         write_unlock_bh(&table->tb6_lock);
1463
1464         return err;
1465 }
1466
1467 int ip6_del_rt(struct rt6_info *rt)
1468 {
1469         struct nl_info info = {
1470                 .nl_net = dev_net(rt->rt6i_dev),
1471         };
1472         return __ip6_del_rt(rt, &info);
1473 }
1474
1475 static int ip6_route_del(struct fib6_config *cfg)
1476 {
1477         struct fib6_table *table;
1478         struct fib6_node *fn;
1479         struct rt6_info *rt;
1480         int err = -ESRCH;
1481
1482         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1483         if (!table)
1484                 return err;
1485
1486         read_lock_bh(&table->tb6_lock);
1487
1488         fn = fib6_locate(&table->tb6_root,
1489                          &cfg->fc_dst, cfg->fc_dst_len,
1490                          &cfg->fc_src, cfg->fc_src_len);
1491
1492         if (fn) {
1493                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1494                         if (cfg->fc_ifindex &&
1495                             (!rt->rt6i_dev ||
1496                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1497                                 continue;
1498                         if (cfg->fc_flags & RTF_GATEWAY &&
1499                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1500                                 continue;
1501                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1502                                 continue;
1503                         dst_hold(&rt->dst);
1504                         read_unlock_bh(&table->tb6_lock);
1505
1506                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1507                 }
1508         }
1509         read_unlock_bh(&table->tb6_lock);
1510
1511         return err;
1512 }
1513
1514 /*
1515  *      Handle redirects
1516  */
/*
 * Flow key used for redirect lookups: a flowi6 extended with the address
 * of the gateway that sent the redirect.  ip6_route_redirect() passes a
 * pointer to the embedded fl6 into the rule lookup and
 * __ip6_route_redirect() casts that pointer back to this structure, so
 * fl6 must remain the first member.
 */
1517 struct ip6rd_flowi {
1518         struct flowi6 fl6;
1519         struct in6_addr gateway;
1520 };
1521
1522 static struct rt6_info *__ip6_route_redirect(struct net *net,
1523                                              struct fib6_table *table,
1524                                              struct flowi6 *fl6,
1525                                              int flags)
1526 {
1527         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1528         struct rt6_info *rt;
1529         struct fib6_node *fn;
1530
1531         /*
1532          * Get the "current" route for this destination and
1533          * check if the redirect has come from approriate router.
1534          *
1535          * RFC 2461 specifies that redirects should only be
1536          * accepted if they come from the nexthop to the target.
1537          * Due to the way the routes are chosen, this notion
1538          * is a bit fuzzy and one might need to check all possible
1539          * routes.
1540          */
1541
1542         read_lock_bh(&table->tb6_lock);
1543         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1544 restart:
1545         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1546                 /*
1547                  * Current route is on-link; redirect is always invalid.
1548                  *
1549                  * Seems, previous statement is not true. It could
1550                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1551                  * But then router serving it might decide, that we should
1552                  * know truth 8)8) --ANK (980726).
1553                  */
1554                 if (rt6_check_expired(rt))
1555                         continue;
1556                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1557                         continue;
1558                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1559                         continue;
1560                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1561                         continue;
1562                 break;
1563         }
1564
1565         if (!rt)
1566                 rt = net->ipv6.ip6_null_entry;
1567         BACKTRACK(net, &fl6->saddr);
1568 out:
1569         dst_hold(&rt->dst);
1570
1571         read_unlock_bh(&table->tb6_lock);
1572
1573         return rt;
1574 };
1575
1576 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1577                                            const struct in6_addr *src,
1578                                            const struct in6_addr *gateway,
1579                                            struct net_device *dev)
1580 {
1581         int flags = RT6_LOOKUP_F_HAS_SADDR;
1582         struct net *net = dev_net(dev);
1583         struct ip6rd_flowi rdfl = {
1584                 .fl6 = {
1585                         .flowi6_oif = dev->ifindex,
1586                         .daddr = *dest,
1587                         .saddr = *src,
1588                 },
1589         };
1590
1591         rdfl.gateway = *gateway;
1592
1593         if (rt6_need_strict(dest))
1594                 flags |= RT6_LOOKUP_F_IFACE;
1595
1596         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1597                                                    flags, __ip6_route_redirect);
1598 }
1599
1600 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1601                   const struct in6_addr *saddr,
1602                   struct neighbour *neigh, u8 *lladdr, int on_link)
1603 {
1604         struct rt6_info *rt, *nrt = NULL;
1605         struct netevent_redirect netevent;
1606         struct net *net = dev_net(neigh->dev);
1607
1608         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1609
1610         if (rt == net->ipv6.ip6_null_entry) {
1611                 if (net_ratelimit())
1612                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1613                                "for redirect target\n");
1614                 goto out;
1615         }
1616
1617         /*
1618          *      We have finally decided to accept it.
1619          */
1620
1621         neigh_update(neigh, lladdr, NUD_STALE,
1622                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1623                      NEIGH_UPDATE_F_OVERRIDE|
1624                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1625                                      NEIGH_UPDATE_F_ISROUTER))
1626                      );
1627
1628         /*
1629          * Redirect received -> path was valid.
1630          * Look, redirects are sent only in response to data packets,
1631          * so that this nexthop apparently is reachable. --ANK
1632          */
1633         dst_confirm(&rt->dst);
1634
1635         /* Duplicate redirect: silently ignore. */
1636         if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1637                 goto out;
1638
1639         nrt = ip6_rt_copy(rt, dest);
1640         if (!nrt)
1641                 goto out;
1642
1643         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1644         if (on_link)
1645                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1646
1647         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1648         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1649
1650         if (ip6_ins_rt(nrt))
1651                 goto out;
1652
1653         netevent.old = &rt->dst;
1654         netevent.new = &nrt->dst;
1655         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1656
1657         if (rt->rt6i_flags & RTF_CACHE) {
1658                 ip6_del_rt(rt);
1659                 return;
1660         }
1661
1662 out:
1663         dst_release(&rt->dst);
1664 }
1665
1666 /*
1667  *      Handle ICMP "packet too big" messages
1668  *      i.e. Path MTU discovery
1669  */
1670
1671 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1672                              struct net *net, u32 pmtu, int ifindex)
1673 {
1674         struct rt6_info *rt, *nrt;
1675         int allfrag = 0;
1676 again:
1677         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1678         if (!rt)
1679                 return;
1680
1681         if (rt6_check_expired(rt)) {
1682                 ip6_del_rt(rt);
1683                 goto again;
1684         }
1685
1686         if (pmtu >= dst_mtu(&rt->dst))
1687                 goto out;
1688
1689         if (pmtu < IPV6_MIN_MTU) {
1690                 /*
1691                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1692                  * MTU (1280) and a fragment header should always be included
1693                  * after a node receiving Too Big message reporting PMTU is
1694                  * less than the IPv6 Minimum Link MTU.
1695                  */
1696                 pmtu = IPV6_MIN_MTU;
1697                 allfrag = 1;
1698         }
1699
1700         /* New mtu received -> path was valid.
1701            They are sent only in response to data packets,
1702            so that this nexthop apparently is reachable. --ANK
1703          */
1704         dst_confirm(&rt->dst);
1705
1706         /* Host route. If it is static, it would be better
1707            not to override it, but add new one, so that
1708            when cache entry will expire old pmtu
1709            would return automatically.
1710          */
1711         if (rt->rt6i_flags & RTF_CACHE) {
1712                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1713                 if (allfrag) {
1714                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1715                         features |= RTAX_FEATURE_ALLFRAG;
1716                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1717                 }
1718                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1719                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1720                 goto out;
1721         }
1722
1723         /* Network route.
1724            Two cases are possible:
1725            1. It is connected route. Action: COW
1726            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1727          */
1728         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1729                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1730         else
1731                 nrt = rt6_alloc_clone(rt, daddr);
1732
1733         if (nrt) {
1734                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1735                 if (allfrag) {
1736                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1737                         features |= RTAX_FEATURE_ALLFRAG;
1738                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1739                 }
1740
1741                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1742                  * happened within 5 mins, the recommended timer is 10 mins.
1743                  * Here this route expiration time is set to ip6_rt_mtu_expires
1744                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1745                  * and detecting PMTU increase will be automatically happened.
1746                  */
1747                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1748                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1749
1750                 ip6_ins_rt(nrt);
1751         }
1752 out:
1753         dst_release(&rt->dst);
1754 }
1755
1756 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1757                         struct net_device *dev, u32 pmtu)
1758 {
1759         struct net *net = dev_net(dev);
1760
1761         /*
1762          * RFC 1981 states that a node "MUST reduce the size of the packets it
1763          * is sending along the path" that caused the Packet Too Big message.
1764          * Since it's not possible in the general case to determine which
1765          * interface was used to send the original packet, we update the MTU
1766          * on the interface that will be used to send future packets. We also
1767          * update the MTU on the interface that received the Packet Too Big in
1768          * case the original packet was forced out that interface with
1769          * SO_BINDTODEVICE or similar. This is the next best thing to the
1770          * correct behaviour, which would be to update the MTU on all
1771          * interfaces.
1772          */
1773         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1774         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1775 }
1776
1777 /*
1778  *      Misc support functions
1779  */
1780
1781 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1782                                     const struct in6_addr *dest)
1783 {
1784         struct net *net = dev_net(ort->rt6i_dev);
1785         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1786                                             ort->dst.dev, 0);
1787
1788         if (rt) {
1789                 rt->dst.input = ort->dst.input;
1790                 rt->dst.output = ort->dst.output;
1791                 rt->dst.flags |= DST_HOST;
1792
1793                 rt->rt6i_dst.addr = *dest;
1794                 rt->rt6i_dst.plen = 128;
1795                 dst_copy_metrics(&rt->dst, &ort->dst);
1796                 rt->dst.error = ort->dst.error;
1797                 rt->rt6i_idev = ort->rt6i_idev;
1798                 if (rt->rt6i_idev)
1799                         in6_dev_hold(rt->rt6i_idev);
1800                 rt->dst.lastuse = jiffies;
1801                 rt->rt6i_expires = 0;
1802
1803                 rt->rt6i_gateway = ort->rt6i_gateway;
1804                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1805                 rt->rt6i_metric = 0;
1806
1807 #ifdef CONFIG_IPV6_SUBTREES
1808                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1809 #endif
1810                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1811                 rt->rt6i_table = ort->rt6i_table;
1812         }
1813         return rt;
1814 }
1815
1816 #ifdef CONFIG_IPV6_ROUTE_INFO
1817 static struct rt6_info *rt6_get_route_info(struct net *net,
1818                                            const struct in6_addr *prefix, int prefixlen,
1819                                            const struct in6_addr *gwaddr, int ifindex)
1820 {
1821         struct fib6_node *fn;
1822         struct rt6_info *rt = NULL;
1823         struct fib6_table *table;
1824
1825         table = fib6_get_table(net, RT6_TABLE_INFO);
1826         if (!table)
1827                 return NULL;
1828
1829         write_lock_bh(&table->tb6_lock);
1830         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1831         if (!fn)
1832                 goto out;
1833
1834         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1835                 if (rt->rt6i_dev->ifindex != ifindex)
1836                         continue;
1837                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1838                         continue;
1839                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1840                         continue;
1841                 dst_hold(&rt->dst);
1842                 break;
1843         }
1844 out:
1845         write_unlock_bh(&table->tb6_lock);
1846         return rt;
1847 }
1848
1849 static struct rt6_info *rt6_add_route_info(struct net *net,
1850                                            const struct in6_addr *prefix, int prefixlen,
1851                                            const struct in6_addr *gwaddr, int ifindex,
1852                                            unsigned pref)
1853 {
1854         struct fib6_config cfg = {
1855                 .fc_table       = RT6_TABLE_INFO,
1856                 .fc_metric      = IP6_RT_PRIO_USER,
1857                 .fc_ifindex     = ifindex,
1858                 .fc_dst_len     = prefixlen,
1859                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1860                                   RTF_UP | RTF_PREF(pref),
1861                 .fc_nlinfo.pid = 0,
1862                 .fc_nlinfo.nlh = NULL,
1863                 .fc_nlinfo.nl_net = net,
1864         };
1865
1866         cfg.fc_dst = *prefix;
1867         cfg.fc_gateway = *gwaddr;
1868
1869         /* We should treat it as a default route if prefix length is 0. */
1870         if (!prefixlen)
1871                 cfg.fc_flags |= RTF_DEFAULT;
1872
1873         ip6_route_add(&cfg);
1874
1875         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1876 }
1877 #endif
1878
1879 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1880 {
1881         struct rt6_info *rt;
1882         struct fib6_table *table;
1883
1884         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1885         if (!table)
1886                 return NULL;
1887
1888         write_lock_bh(&table->tb6_lock);
1889         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1890                 if (dev == rt->rt6i_dev &&
1891                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1892                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1893                         break;
1894         }
1895         if (rt)
1896                 dst_hold(&rt->dst);
1897         write_unlock_bh(&table->tb6_lock);
1898         return rt;
1899 }
1900
1901 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1902                                      struct net_device *dev,
1903                                      unsigned int pref)
1904 {
1905         struct fib6_config cfg = {
1906                 .fc_table       = RT6_TABLE_DFLT,
1907                 .fc_metric      = IP6_RT_PRIO_USER,
1908                 .fc_ifindex     = dev->ifindex,
1909                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1910                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1911                 .fc_nlinfo.pid = 0,
1912                 .fc_nlinfo.nlh = NULL,
1913                 .fc_nlinfo.nl_net = dev_net(dev),
1914         };
1915
1916         cfg.fc_gateway = *gwaddr;
1917
1918         ip6_route_add(&cfg);
1919
1920         return rt6_get_dflt_router(gwaddr, dev);
1921 }
1922
1923 void rt6_purge_dflt_routers(struct net *net)
1924 {
1925         struct rt6_info *rt;
1926         struct fib6_table *table;
1927
1928         /* NOTE: Keep consistent with rt6_get_dflt_router */
1929         table = fib6_get_table(net, RT6_TABLE_DFLT);
1930         if (!table)
1931                 return;
1932
1933 restart:
1934         read_lock_bh(&table->tb6_lock);
1935         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1936                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1937                         dst_hold(&rt->dst);
1938                         read_unlock_bh(&table->tb6_lock);
1939                         ip6_del_rt(rt);
1940                         goto restart;
1941                 }
1942         }
1943         read_unlock_bh(&table->tb6_lock);
1944 }
1945
1946 static void rtmsg_to_fib6_config(struct net *net,
1947                                  struct in6_rtmsg *rtmsg,
1948                                  struct fib6_config *cfg)
1949 {
1950         memset(cfg, 0, sizeof(*cfg));
1951
1952         cfg->fc_table = RT6_TABLE_MAIN;
1953         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1954         cfg->fc_metric = rtmsg->rtmsg_metric;
1955         cfg->fc_expires = rtmsg->rtmsg_info;
1956         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1957         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1958         cfg->fc_flags = rtmsg->rtmsg_flags;
1959
1960         cfg->fc_nlinfo.nl_net = net;
1961
1962         cfg->fc_dst = rtmsg->rtmsg_dst;
1963         cfg->fc_src = rtmsg->rtmsg_src;
1964         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1965 }
1966
1967 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1968 {
1969         struct fib6_config cfg;
1970         struct in6_rtmsg rtmsg;
1971         int err;
1972
1973         switch(cmd) {
1974         case SIOCADDRT:         /* Add a route */
1975         case SIOCDELRT:         /* Delete a route */
1976                 if (!capable(CAP_NET_ADMIN))
1977                         return -EPERM;
1978                 err = copy_from_user(&rtmsg, arg,
1979                                      sizeof(struct in6_rtmsg));
1980                 if (err)
1981                         return -EFAULT;
1982
1983                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1984
1985                 rtnl_lock();
1986                 switch (cmd) {
1987                 case SIOCADDRT:
1988                         err = ip6_route_add(&cfg);
1989                         break;
1990                 case SIOCDELRT:
1991                         err = ip6_route_del(&cfg);
1992                         break;
1993                 default:
1994                         err = -EINVAL;
1995                 }
1996                 rtnl_unlock();
1997
1998                 return err;
1999         }
2000
2001         return -EINVAL;
2002 }
2003
2004 /*
2005  *      Drop the packet on the floor
2006  */
2007
2008 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2009 {
2010         int type;
2011         struct dst_entry *dst = skb_dst(skb);
2012         switch (ipstats_mib_noroutes) {
2013         case IPSTATS_MIB_INNOROUTES:
2014                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2015                 if (type == IPV6_ADDR_ANY) {
2016                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2017                                       IPSTATS_MIB_INADDRERRORS);
2018                         break;
2019                 }
2020                 /* FALLTHROUGH */
2021         case IPSTATS_MIB_OUTNOROUTES:
2022                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2023                               ipstats_mib_noroutes);
2024                 break;
2025         }
2026         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2027         kfree_skb(skb);
2028         return 0;
2029 }
2030
2031 static int ip6_pkt_discard(struct sk_buff *skb)
2032 {
2033         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2034 }
2035
2036 static int ip6_pkt_discard_out(struct sk_buff *skb)
2037 {
2038         skb->dev = skb_dst(skb)->dev;
2039         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2040 }
2041
2042 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2043
2044 static int ip6_pkt_prohibit(struct sk_buff *skb)
2045 {
2046         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2047 }
2048
2049 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2050 {
2051         skb->dev = skb_dst(skb)->dev;
2052         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2053 }
2054
2055 #endif
2056
2057 /*
2058  *      Allocate a dst for local (unicast / anycast) address.
2059  */
2060
2061 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2062                                     const struct in6_addr *addr,
2063                                     bool anycast)
2064 {
2065         struct net *net = dev_net(idev->dev);
2066         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2067                                             net->loopback_dev, 0);
2068         struct neighbour *neigh;
2069
2070         if (!rt) {
2071                 if (net_ratelimit())
2072                         pr_warning("IPv6:  Maximum number of routes reached,"
2073                                    " consider increasing route/max_size.\n");
2074                 return ERR_PTR(-ENOMEM);
2075         }
2076
2077         in6_dev_hold(idev);
2078
2079         rt->dst.flags |= DST_HOST;
2080         rt->dst.input = ip6_input;
2081         rt->dst.output = ip6_output;
2082         rt->rt6i_idev = idev;
2083         rt->dst.obsolete = -1;
2084
2085         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2086         if (anycast)
2087                 rt->rt6i_flags |= RTF_ANYCAST;
2088         else
2089                 rt->rt6i_flags |= RTF_LOCAL;
2090         neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, rt->rt6i_dev);
2091         if (IS_ERR(neigh)) {
2092                 dst_free(&rt->dst);
2093
2094                 return ERR_CAST(neigh);
2095         }
2096         dst_set_neighbour(&rt->dst, neigh);
2097
2098         rt->rt6i_dst.addr = *addr;
2099         rt->rt6i_dst.plen = 128;
2100         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2101
2102         atomic_set(&rt->dst.__refcnt, 1);
2103
2104         return rt;
2105 }
2106
2107 int ip6_route_get_saddr(struct net *net,
2108                         struct rt6_info *rt,
2109                         const struct in6_addr *daddr,
2110                         unsigned int prefs,
2111                         struct in6_addr *saddr)
2112 {
2113         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2114         int err = 0;
2115         if (rt->rt6i_prefsrc.plen)
2116                 *saddr = rt->rt6i_prefsrc.addr;
2117         else
2118                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2119                                          daddr, prefs, saddr);
2120         return err;
2121 }
2122
2123 /* remove deleted ip from prefsrc entries */
/* Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device filter; NULL matches any device */
	struct net *net;		/* namespace whose null entry is skipped */
	struct in6_addr *addr;		/* address compared against each prefsrc */
};
2129
2130 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2131 {
2132         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2133         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2134         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2135
2136         if (((void *)rt->rt6i_dev == dev || !dev) &&
2137             rt != net->ipv6.ip6_null_entry &&
2138             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2139                 /* remove prefsrc entry */
2140                 rt->rt6i_prefsrc.plen = 0;
2141         }
2142         return 0;
2143 }
2144
2145 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2146 {
2147         struct net *net = dev_net(ifp->idev->dev);
2148         struct arg_dev_net_ip adni = {
2149                 .dev = ifp->idev->dev,
2150                 .net = net,
2151                 .addr = &ifp->addr,
2152         };
2153         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2154 }
2155
/* Argument bundle handed to fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device filter; NULL matches any device */
	struct net *net;		/* namespace whose null entry is skipped */
};
2160
2161 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2162 {
2163         const struct arg_dev_net *adn = arg;
2164         const struct net_device *dev = adn->dev;
2165
2166         if ((rt->rt6i_dev == dev || !dev) &&
2167             rt != adn->net->ipv6.ip6_null_entry) {
2168                 RT6_TRACE("deleted by ifdown %p\n", rt);
2169                 return -1;
2170         }
2171         return 0;
2172 }
2173
2174 void rt6_ifdown(struct net *net, struct net_device *dev)
2175 {
2176         struct arg_dev_net adn = {
2177                 .dev = dev,
2178                 .net = net,
2179         };
2180
2181         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2182         icmp6_clean_all(fib6_ifdown, &adn);
2183 }
2184
/* Argument bundle handed to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
2190
2191 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2192 {
2193         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2194         struct inet6_dev *idev;
2195
2196         /* In IPv6 pmtu discovery is not optional,
2197            so that RTAX_MTU lock cannot disable it.
2198            We still use this lock to block changes
2199            caused by addrconf/ndisc.
2200         */
2201
2202         idev = __in6_dev_get(arg->dev);
2203         if (!idev)
2204                 return 0;
2205
2206         /* For administrative MTU increase, there is no way to discover
2207            IPv6 PMTU increase, so PMTU increase should be updated here.
2208            Since RFC 1981 doesn't include administrative MTU increase
2209            update PMTU increase is a MUST. (i.e. jumbo frame)
2210          */
2211         /*
2212            If new MTU is less than route PMTU, this new MTU will be the
2213            lowest MTU in the path, update the route PMTU to reflect PMTU
2214            decreases; if new MTU is greater than route PMTU, and the
2215            old MTU is the lowest MTU in the path, update the route PMTU
2216            to reflect the increase. In this case if the other nodes' MTU
2217            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2218            PMTU discouvery.
2219          */
2220         if (rt->rt6i_dev == arg->dev &&
2221             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2222             (dst_mtu(&rt->dst) >= arg->mtu ||
2223              (dst_mtu(&rt->dst) < arg->mtu &&
2224               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2225                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2226         }
2227         return 0;
2228 }
2229
2230 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2231 {
2232         struct rt6_mtu_change_arg arg = {
2233                 .dev = dev,
2234                 .mtu = mtu,
2235         };
2236
2237         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2238 }
2239
2240 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2241         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2242         [RTA_OIF]               = { .type = NLA_U32 },
2243         [RTA_IIF]               = { .type = NLA_U32 },
2244         [RTA_PRIORITY]          = { .type = NLA_U32 },
2245         [RTA_METRICS]           = { .type = NLA_NESTED },
2246 };
2247
2248 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2249                               struct fib6_config *cfg)
2250 {
2251         struct rtmsg *rtm;
2252         struct nlattr *tb[RTA_MAX+1];
2253         int err;
2254
2255         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2256         if (err < 0)
2257                 goto errout;
2258
2259         err = -EINVAL;
2260         rtm = nlmsg_data(nlh);
2261         memset(cfg, 0, sizeof(*cfg));
2262
2263         cfg->fc_table = rtm->rtm_table;
2264         cfg->fc_dst_len = rtm->rtm_dst_len;
2265         cfg->fc_src_len = rtm->rtm_src_len;
2266         cfg->fc_flags = RTF_UP;
2267         cfg->fc_protocol = rtm->rtm_protocol;
2268
2269         if (rtm->rtm_type == RTN_UNREACHABLE)
2270                 cfg->fc_flags |= RTF_REJECT;
2271
2272         if (rtm->rtm_type == RTN_LOCAL)
2273                 cfg->fc_flags |= RTF_LOCAL;
2274
2275         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2276         cfg->fc_nlinfo.nlh = nlh;
2277         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2278
2279         if (tb[RTA_GATEWAY]) {
2280                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2281                 cfg->fc_flags |= RTF_GATEWAY;
2282         }
2283
2284         if (tb[RTA_DST]) {
2285                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2286
2287                 if (nla_len(tb[RTA_DST]) < plen)
2288                         goto errout;
2289
2290                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2291         }
2292
2293         if (tb[RTA_SRC]) {
2294                 int plen = (rtm->rtm_src_len + 7) >> 3;
2295
2296                 if (nla_len(tb[RTA_SRC]) < plen)
2297                         goto errout;
2298
2299                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2300         }
2301
2302         if (tb[RTA_PREFSRC])
2303                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2304
2305         if (tb[RTA_OIF])
2306                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2307
2308         if (tb[RTA_PRIORITY])
2309                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2310
2311         if (tb[RTA_METRICS]) {
2312                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2313                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2314         }
2315
2316         if (tb[RTA_TABLE])
2317                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2318
2319         err = 0;
2320 errout:
2321         return err;
2322 }
2323
2324 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2325 {
2326         struct fib6_config cfg;
2327         int err;
2328
2329         err = rtm_to_fib6_config(skb, nlh, &cfg);
2330         if (err < 0)
2331                 return err;
2332
2333         return ip6_route_del(&cfg);
2334 }
2335
2336 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2337 {
2338         struct fib6_config cfg;
2339         int err;
2340
2341         err = rtm_to_fib6_config(skb, nlh, &cfg);
2342         if (err < 0)
2343                 return err;
2344
2345         return ip6_route_add(&cfg);
2346 }
2347
2348 static inline size_t rt6_nlmsg_size(void)
2349 {
2350         return NLMSG_ALIGN(sizeof(struct rtmsg))
2351                + nla_total_size(16) /* RTA_SRC */
2352                + nla_total_size(16) /* RTA_DST */
2353                + nla_total_size(16) /* RTA_GATEWAY */
2354                + nla_total_size(16) /* RTA_PREFSRC */
2355                + nla_total_size(4) /* RTA_TABLE */
2356                + nla_total_size(4) /* RTA_IIF */
2357                + nla_total_size(4) /* RTA_OIF */
2358                + nla_total_size(4) /* RTA_PRIORITY */
2359                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2360                + nla_total_size(sizeof(struct rta_cacheinfo));
2361 }
2362
2363 static int rt6_fill_node(struct net *net,
2364                          struct sk_buff *skb, struct rt6_info *rt,
2365                          struct in6_addr *dst, struct in6_addr *src,
2366                          int iif, int type, u32 pid, u32 seq,
2367                          int prefix, int nowait, unsigned int flags)
2368 {
2369         struct rtmsg *rtm;
2370         struct nlmsghdr *nlh;
2371         long expires;
2372         u32 table;
2373         struct neighbour *n;
2374
2375         if (prefix) {   /* user wants prefix routes only */
2376                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2377                         /* success since this is not a prefix route */
2378                         return 1;
2379                 }
2380         }
2381
2382         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2383         if (!nlh)
2384                 return -EMSGSIZE;
2385
2386         rtm = nlmsg_data(nlh);
2387         rtm->rtm_family = AF_INET6;
2388         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2389         rtm->rtm_src_len = rt->rt6i_src.plen;
2390         rtm->rtm_tos = 0;
2391         if (rt->rt6i_table)
2392                 table = rt->rt6i_table->tb6_id;
2393         else
2394                 table = RT6_TABLE_UNSPEC;
2395         rtm->rtm_table = table;
2396         NLA_PUT_U32(skb, RTA_TABLE, table);
2397         if (rt->rt6i_flags & RTF_REJECT)
2398                 rtm->rtm_type = RTN_UNREACHABLE;
2399         else if (rt->rt6i_flags & RTF_LOCAL)
2400                 rtm->rtm_type = RTN_LOCAL;
2401         else if (rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
2402                 rtm->rtm_type = RTN_LOCAL;
2403         else
2404                 rtm->rtm_type = RTN_UNICAST;
2405         rtm->rtm_flags = 0;
2406         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2407         rtm->rtm_protocol = rt->rt6i_protocol;
2408         if (rt->rt6i_flags & RTF_DYNAMIC)
2409                 rtm->rtm_protocol = RTPROT_REDIRECT;
2410         else if (rt->rt6i_flags & RTF_ADDRCONF)
2411                 rtm->rtm_protocol = RTPROT_KERNEL;
2412         else if (rt->rt6i_flags & RTF_DEFAULT)
2413                 rtm->rtm_protocol = RTPROT_RA;
2414
2415         if (rt->rt6i_flags & RTF_CACHE)
2416                 rtm->rtm_flags |= RTM_F_CLONED;
2417
2418         if (dst) {
2419                 NLA_PUT(skb, RTA_DST, 16, dst);
2420                 rtm->rtm_dst_len = 128;
2421         } else if (rtm->rtm_dst_len)
2422                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2423 #ifdef CONFIG_IPV6_SUBTREES
2424         if (src) {
2425                 NLA_PUT(skb, RTA_SRC, 16, src);
2426                 rtm->rtm_src_len = 128;
2427         } else if (rtm->rtm_src_len)
2428                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2429 #endif
2430         if (iif) {
2431 #ifdef CONFIG_IPV6_MROUTE
2432                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2433                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2434                         if (err <= 0) {
2435                                 if (!nowait) {
2436                                         if (err == 0)
2437                                                 return 0;
2438                                         goto nla_put_failure;
2439                                 } else {
2440                                         if (err == -EMSGSIZE)
2441                                                 goto nla_put_failure;
2442                                 }
2443                         }
2444                 } else
2445 #endif
2446                         NLA_PUT_U32(skb, RTA_IIF, iif);
2447         } else if (dst) {
2448                 struct in6_addr saddr_buf;
2449                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2450                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2451         }
2452
2453         if (rt->rt6i_prefsrc.plen) {
2454                 struct in6_addr saddr_buf;
2455                 saddr_buf = rt->rt6i_prefsrc.addr;
2456                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2457         }
2458
2459         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2460                 goto nla_put_failure;
2461
2462         rcu_read_lock();
2463         n = dst_get_neighbour_noref(&rt->dst);
2464         if (n)
2465                 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2466         rcu_read_unlock();
2467
2468         if (rt->dst.dev)
2469                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2470
2471         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2472
2473         if (!(rt->rt6i_flags & RTF_EXPIRES))
2474                 expires = 0;
2475         else if (rt->rt6i_expires - jiffies < INT_MAX)
2476                 expires = rt->rt6i_expires - jiffies;
2477         else
2478                 expires = INT_MAX;
2479
2480         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2481                                expires, rt->dst.error) < 0)
2482                 goto nla_put_failure;
2483
2484         return nlmsg_end(skb, nlh);
2485
2486 nla_put_failure:
2487         nlmsg_cancel(skb, nlh);
2488         return -EMSGSIZE;
2489 }
2490
2491 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2492 {
2493         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2494         int prefix;
2495
2496         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2497                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2498                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2499         } else
2500                 prefix = 0;
2501
2502         return rt6_fill_node(arg->net,
2503                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2504                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2505                      prefix, 0, NLM_F_MULTI);
2506 }
2507
2508 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2509 {
2510         struct net *net = sock_net(in_skb->sk);
2511         struct nlattr *tb[RTA_MAX+1];
2512         struct rt6_info *rt;
2513         struct sk_buff *skb;
2514         struct rtmsg *rtm;
2515         struct flowi6 fl6;
2516         int err, iif = 0;
2517
2518         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2519         if (err < 0)
2520                 goto errout;
2521
2522         err = -EINVAL;
2523         memset(&fl6, 0, sizeof(fl6));
2524
2525         if (tb[RTA_SRC]) {
2526                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2527                         goto errout;
2528
2529                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2530         }
2531
2532         if (tb[RTA_DST]) {
2533                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2534                         goto errout;
2535
2536                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2537         }
2538
2539         if (tb[RTA_IIF])
2540                 iif = nla_get_u32(tb[RTA_IIF]);
2541
2542         if (tb[RTA_OIF])
2543                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2544
2545         if (iif) {
2546                 struct net_device *dev;
2547                 dev = __dev_get_by_index(net, iif);
2548                 if (!dev) {
2549                         err = -ENODEV;
2550                         goto errout;
2551                 }
2552         }
2553
2554         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2555         if (!skb) {
2556                 err = -ENOBUFS;
2557                 goto errout;
2558         }
2559
2560         /* Reserve room for dummy headers, this skb can pass
2561            through good chunk of routing engine.
2562          */
2563         skb_reset_mac_header(skb);
2564         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2565
2566         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2567         skb_dst_set(skb, &rt->dst);
2568
2569         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2570                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2571                             nlh->nlmsg_seq, 0, 0, 0);
2572         if (err < 0) {
2573                 kfree_skb(skb);
2574                 goto errout;
2575         }
2576
2577         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2578 errout:
2579         return err;
2580 }
2581
2582 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2583 {
2584         struct sk_buff *skb;
2585         struct net *net = info->nl_net;
2586         u32 seq;
2587         int err;
2588
2589         err = -ENOBUFS;
2590         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2591
2592         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2593         if (!skb)
2594                 goto errout;
2595
2596         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2597                                 event, info->pid, seq, 0, 0, 0);
2598         if (err < 0) {
2599                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2600                 WARN_ON(err == -EMSGSIZE);
2601                 kfree_skb(skb);
2602                 goto errout;
2603         }
2604         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2605                     info->nlh, gfp_any());
2606         return;
2607 errout:
2608         if (err < 0)
2609                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2610 }
2611
2612 static int ip6_route_dev_notify(struct notifier_block *this,
2613                                 unsigned long event, void *data)
2614 {
2615         struct net_device *dev = (struct net_device *)data;
2616         struct net *net = dev_net(dev);
2617
2618         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2619                 net->ipv6.ip6_null_entry->dst.dev = dev;
2620                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2621 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2622                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2623                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2624                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2625                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2626 #endif
2627         }
2628
2629         return NOTIFY_OK;
2630 }
2631
2632 /*
2633  *      /proc
2634  */
2635
2636 #ifdef CONFIG_PROC_FS
2637
/*
 * Bookkeeping for a buffer-based /proc dump.
 * NOTE(review): nothing in the visible part of this file references
 * this structure; it may be a leftover from before the seq_file
 * conversion -- confirm before relying on it.
 */
struct rt6_proc_arg {
        char *buffer;
        int offset;
        int length;
        int skip;
        int len;
};
2646
2647 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2648 {
2649         struct seq_file *m = p_arg;
2650         struct neighbour *n;
2651
2652         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2653
2654 #ifdef CONFIG_IPV6_SUBTREES
2655         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2656 #else
2657         seq_puts(m, "00000000000000000000000000000000 00 ");
2658 #endif
2659         rcu_read_lock();
2660         n = dst_get_neighbour_noref(&rt->dst);
2661         if (n) {
2662                 seq_printf(m, "%pi6", n->primary_key);
2663         } else {
2664                 seq_puts(m, "00000000000000000000000000000000");
2665         }
2666         rcu_read_unlock();
2667         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2668                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2669                    rt->dst.__use, rt->rt6i_flags,
2670                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2671         return 0;
2672 }
2673
2674 static int ipv6_route_show(struct seq_file *m, void *v)
2675 {
2676         struct net *net = (struct net *)m->private;
2677         fib6_clean_all(net, rt6_info_route, 0, m);
2678         return 0;
2679 }
2680
2681 static int ipv6_route_open(struct inode *inode, struct file *file)
2682 {
2683         return single_open_net(inode, file, ipv6_route_show);
2684 }
2685
2686 static const struct file_operations ipv6_route_proc_fops = {
2687         .owner          = THIS_MODULE,
2688         .open           = ipv6_route_open,
2689         .read           = seq_read,
2690         .llseek         = seq_lseek,
2691         .release        = single_release_net,
2692 };
2693
2694 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2695 {
2696         struct net *net = (struct net *)seq->private;
2697         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2698                    net->ipv6.rt6_stats->fib_nodes,
2699                    net->ipv6.rt6_stats->fib_route_nodes,
2700                    net->ipv6.rt6_stats->fib_rt_alloc,
2701                    net->ipv6.rt6_stats->fib_rt_entries,
2702                    net->ipv6.rt6_stats->fib_rt_cache,
2703                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2704                    net->ipv6.rt6_stats->fib_discarded_routes);
2705
2706         return 0;
2707 }
2708
2709 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2710 {
2711         return single_open_net(inode, file, rt6_stats_seq_show);
2712 }
2713
2714 static const struct file_operations rt6_stats_seq_fops = {
2715         .owner   = THIS_MODULE,
2716         .open    = rt6_stats_seq_open,
2717         .read    = seq_read,
2718         .llseek  = seq_lseek,
2719         .release = single_release_net,
2720 };
2721 #endif  /* CONFIG_PROC_FS */
2722
2723 #ifdef CONFIG_SYSCTL
2724
2725 static
2726 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2727                               void __user *buffer, size_t *lenp, loff_t *ppos)
2728 {
2729         struct net *net;
2730         int delay;
2731         if (!write)
2732                 return -EINVAL;
2733
2734         net = (struct net *)ctl->extra1;
2735         delay = net->ipv6.sysctl.flush_delay;
2736         proc_dointvec(ctl, write, buffer, lenp, ppos);
2737         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2738         return 0;
2739 }
2740
2741 ctl_table ipv6_route_table_template[] = {
2742         {
2743                 .procname       =       "flush",
2744                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2745                 .maxlen         =       sizeof(int),
2746                 .mode           =       0200,
2747                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2748         },
2749         {
2750                 .procname       =       "gc_thresh",
2751                 .data           =       &ip6_dst_ops_template.gc_thresh,
2752                 .maxlen         =       sizeof(int),
2753                 .mode           =       0644,
2754                 .proc_handler   =       proc_dointvec,
2755         },
2756         {
2757                 .procname       =       "max_size",
2758                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2759                 .maxlen         =       sizeof(int),
2760                 .mode           =       0644,
2761                 .proc_handler   =       proc_dointvec,
2762         },
2763         {
2764                 .procname       =       "gc_min_interval",
2765                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2766                 .maxlen         =       sizeof(int),
2767                 .mode           =       0644,
2768                 .proc_handler   =       proc_dointvec_jiffies,
2769         },
2770         {
2771                 .procname       =       "gc_timeout",
2772                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2773                 .maxlen         =       sizeof(int),
2774                 .mode           =       0644,
2775                 .proc_handler   =       proc_dointvec_jiffies,
2776         },
2777         {
2778                 .procname       =       "gc_interval",
2779                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2780                 .maxlen         =       sizeof(int),
2781                 .mode           =       0644,
2782                 .proc_handler   =       proc_dointvec_jiffies,
2783         },
2784         {
2785                 .procname       =       "gc_elasticity",
2786                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2787                 .maxlen         =       sizeof(int),
2788                 .mode           =       0644,
2789                 .proc_handler   =       proc_dointvec,
2790         },
2791         {
2792                 .procname       =       "mtu_expires",
2793                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2794                 .maxlen         =       sizeof(int),
2795                 .mode           =       0644,
2796                 .proc_handler   =       proc_dointvec_jiffies,
2797         },
2798         {
2799                 .procname       =       "min_adv_mss",
2800                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2801                 .maxlen         =       sizeof(int),
2802                 .mode           =       0644,
2803                 .proc_handler   =       proc_dointvec,
2804         },
2805         {
2806                 .procname       =       "gc_min_interval_ms",
2807                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2808                 .maxlen         =       sizeof(int),
2809                 .mode           =       0644,
2810                 .proc_handler   =       proc_dointvec_ms_jiffies,
2811         },
2812         { }
2813 };
2814
2815 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2816 {
2817         struct ctl_table *table;
2818
2819         table = kmemdup(ipv6_route_table_template,
2820                         sizeof(ipv6_route_table_template),
2821                         GFP_KERNEL);
2822
2823         if (table) {
2824                 table[0].data = &net->ipv6.sysctl.flush_delay;
2825                 table[0].extra1 = net;
2826                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2827                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2828                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2829                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2830                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2831                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2832                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2833                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2834                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2835         }
2836
2837         return table;
2838 }
2839 #endif
2840
2841 static int __net_init ip6_route_net_init(struct net *net)
2842 {
2843         int ret = -ENOMEM;
2844
2845         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2846                sizeof(net->ipv6.ip6_dst_ops));
2847
2848         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2849                 goto out_ip6_dst_ops;
2850
2851         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2852                                            sizeof(*net->ipv6.ip6_null_entry),
2853                                            GFP_KERNEL);
2854         if (!net->ipv6.ip6_null_entry)
2855                 goto out_ip6_dst_entries;
2856         net->ipv6.ip6_null_entry->dst.path =
2857                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2858         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2859         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2860                          ip6_template_metrics, true);
2861
2862 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2863         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2864                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2865                                                GFP_KERNEL);
2866         if (!net->ipv6.ip6_prohibit_entry)
2867                 goto out_ip6_null_entry;
2868         net->ipv6.ip6_prohibit_entry->dst.path =
2869                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2870         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2871         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2872                          ip6_template_metrics, true);
2873
2874         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2875                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2876                                                GFP_KERNEL);
2877         if (!net->ipv6.ip6_blk_hole_entry)
2878                 goto out_ip6_prohibit_entry;
2879         net->ipv6.ip6_blk_hole_entry->dst.path =
2880                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2881         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2882         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2883                          ip6_template_metrics, true);
2884 #endif
2885
2886         net->ipv6.sysctl.flush_delay = 0;
2887         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2888         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2889         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2890         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2891         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2892         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2893         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2894
2895 #ifdef CONFIG_PROC_FS
2896         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2897         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2898 #endif
2899         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2900
2901         ret = 0;
2902 out:
2903         return ret;
2904
2905 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2906 out_ip6_prohibit_entry:
2907         kfree(net->ipv6.ip6_prohibit_entry);
2908 out_ip6_null_entry:
2909         kfree(net->ipv6.ip6_null_entry);
2910 #endif
2911 out_ip6_dst_entries:
2912         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2913 out_ip6_dst_ops:
2914         goto out;
2915 }
2916
2917 static void __net_exit ip6_route_net_exit(struct net *net)
2918 {
2919 #ifdef CONFIG_PROC_FS
2920         proc_net_remove(net, "ipv6_route");
2921         proc_net_remove(net, "rt6_stats");
2922 #endif
2923         kfree(net->ipv6.ip6_null_entry);
2924 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2925         kfree(net->ipv6.ip6_prohibit_entry);
2926         kfree(net->ipv6.ip6_blk_hole_entry);
2927 #endif
2928         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2929 }
2930
2931 static struct pernet_operations ip6_route_net_ops = {
2932         .init = ip6_route_net_init,
2933         .exit = ip6_route_net_exit,
2934 };
2935
2936 static struct notifier_block ip6_route_dev_notifier = {
2937         .notifier_call = ip6_route_dev_notify,
2938         .priority = 0,
2939 };
2940
2941 int __init ip6_route_init(void)
2942 {
2943         int ret;
2944
2945         ret = -ENOMEM;
2946         ip6_dst_ops_template.kmem_cachep =
2947                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2948                                   SLAB_HWCACHE_ALIGN, NULL);
2949         if (!ip6_dst_ops_template.kmem_cachep)
2950                 goto out;
2951
2952         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2953         if (ret)
2954                 goto out_kmem_cache;
2955
2956         ret = register_pernet_subsys(&ip6_route_net_ops);
2957         if (ret)
2958                 goto out_dst_entries;
2959
2960         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2961
2962         /* Registering of the loopback is done before this portion of code,
2963          * the loopback reference in rt6_info will not be taken, do it
2964          * manually for init_net */
2965         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2966         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2967   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2968         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2969         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2970         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2971         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2972   #endif
2973         ret = fib6_init();
2974         if (ret)
2975                 goto out_register_subsys;
2976
2977         ret = xfrm6_init();
2978         if (ret)
2979                 goto out_fib6_init;
2980
2981         ret = fib6_rules_init();
2982         if (ret)
2983                 goto xfrm6_init;
2984
2985         ret = -ENOBUFS;
2986         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2987             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2988             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2989                 goto fib6_rules_init;
2990
2991         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2992         if (ret)
2993                 goto fib6_rules_init;
2994
2995 out:
2996         return ret;
2997
2998 fib6_rules_init:
2999         fib6_rules_cleanup();
3000 xfrm6_init:
3001         xfrm6_fini();
3002 out_fib6_init:
3003         fib6_gc_cleanup();
3004 out_register_subsys:
3005         unregister_pernet_subsys(&ip6_route_net_ops);
3006 out_dst_entries:
3007         dst_entries_destroy(&ip6_dst_blackhole_ops);
3008 out_kmem_cache:
3009         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3010         goto out;
3011 }
3012
3013 void ip6_route_cleanup(void)
3014 {
3015         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3016         fib6_rules_cleanup();
3017         xfrm6_fini();
3018         fib6_gc_cleanup();
3019         unregister_pernet_subsys(&ip6_route_net_ops);
3020         dst_entries_destroy(&ip6_dst_blackhole_ops);
3021         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3022 }