ipv6: Fix 'inet6_rtm_getroute' to release 'rt->dst' in case of 'alloc_skb' failure
[firefly-linux-kernel-4.4.55.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
65 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
66                                     const struct in6_addr *dest);
67 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
68 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
69 static unsigned int      ip6_mtu(const struct dst_entry *dst);
70 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
71 static void             ip6_dst_destroy(struct dst_entry *);
72 static void             ip6_dst_ifdown(struct dst_entry *,
73                                        struct net_device *dev, int how);
74 static int               ip6_dst_gc(struct dst_ops *ops);
75
76 static int              ip6_pkt_discard(struct sk_buff *skb);
77 static int              ip6_pkt_discard_out(struct sk_buff *skb);
78 static void             ip6_link_failure(struct sk_buff *skb);
79 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
80
81 #ifdef CONFIG_IPV6_ROUTE_INFO
82 static struct rt6_info *rt6_add_route_info(struct net *net,
83                                            const struct in6_addr *prefix, int prefixlen,
84                                            const struct in6_addr *gwaddr, int ifindex,
85                                            unsigned pref);
86 static struct rt6_info *rt6_get_route_info(struct net *net,
87                                            const struct in6_addr *prefix, int prefixlen,
88                                            const struct in6_addr *gwaddr, int ifindex);
89 #endif
90
91 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
92 {
93         struct rt6_info *rt = (struct rt6_info *) dst;
94         struct inet_peer *peer;
95         u32 *p = NULL;
96
97         if (!(rt->dst.flags & DST_HOST))
98                 return NULL;
99
100         if (!rt->rt6i_peer)
101                 rt6_bind_peer(rt, 1);
102
103         peer = rt->rt6i_peer;
104         if (peer) {
105                 u32 *old_p = __DST_METRICS_PTR(old);
106                 unsigned long prev, new;
107
108                 p = peer->metrics;
109                 if (inet_metrics_new(peer))
110                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
111
112                 new = (unsigned long) p;
113                 prev = cmpxchg(&dst->_metrics, old, new);
114
115                 if (prev != old) {
116                         p = __DST_METRICS_PTR(prev);
117                         if (prev & DST_METRICS_READ_ONLY)
118                                 p = NULL;
119                 }
120         }
121         return p;
122 }
123
124 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
125 {
126         struct in6_addr *p = &rt->rt6i_gateway;
127
128         if (!ipv6_addr_any(p))
129                 return (const void *) p;
130         return daddr;
131 }
132
133 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
134 {
135         struct rt6_info *rt = (struct rt6_info *) dst;
136         struct neighbour *n;
137
138         daddr = choose_neigh_daddr(rt, daddr);
139         n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
140         if (n)
141                 return n;
142         return neigh_create(&nd_tbl, daddr, dst->dev);
143 }
144
145 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
146 {
147         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
148         if (!n) {
149                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
150                 if (IS_ERR(n))
151                         return PTR_ERR(n);
152         }
153         dst_set_neighbour(&rt->dst, n);
154
155         return 0;
156 }
157
158 static struct dst_ops ip6_dst_ops_template = {
159         .family                 =       AF_INET6,
160         .protocol               =       cpu_to_be16(ETH_P_IPV6),
161         .gc                     =       ip6_dst_gc,
162         .gc_thresh              =       1024,
163         .check                  =       ip6_dst_check,
164         .default_advmss         =       ip6_default_advmss,
165         .mtu                    =       ip6_mtu,
166         .cow_metrics            =       ipv6_cow_metrics,
167         .destroy                =       ip6_dst_destroy,
168         .ifdown                 =       ip6_dst_ifdown,
169         .negative_advice        =       ip6_negative_advice,
170         .link_failure           =       ip6_link_failure,
171         .update_pmtu            =       ip6_rt_update_pmtu,
172         .local_out              =       __ip6_local_out,
173         .neigh_lookup           =       ip6_neigh_lookup,
174 };
175
176 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
177 {
178         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
179
180         return mtu ? : dst->dev->mtu;
181 }
182
183 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
184 {
185 }
186
187 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
188                                          unsigned long old)
189 {
190         return NULL;
191 }
192
193 static struct dst_ops ip6_dst_blackhole_ops = {
194         .family                 =       AF_INET6,
195         .protocol               =       cpu_to_be16(ETH_P_IPV6),
196         .destroy                =       ip6_dst_destroy,
197         .check                  =       ip6_dst_check,
198         .mtu                    =       ip6_blackhole_mtu,
199         .default_advmss         =       ip6_default_advmss,
200         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
201         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
202         .neigh_lookup           =       ip6_neigh_lookup,
203 };
204
205 static const u32 ip6_template_metrics[RTAX_MAX] = {
206         [RTAX_HOPLIMIT - 1] = 255,
207 };
208
209 static struct rt6_info ip6_null_entry_template = {
210         .dst = {
211                 .__refcnt       = ATOMIC_INIT(1),
212                 .__use          = 1,
213                 .obsolete       = -1,
214                 .error          = -ENETUNREACH,
215                 .input          = ip6_pkt_discard,
216                 .output         = ip6_pkt_discard_out,
217         },
218         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
219         .rt6i_protocol  = RTPROT_KERNEL,
220         .rt6i_metric    = ~(u32) 0,
221         .rt6i_ref       = ATOMIC_INIT(1),
222 };
223
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route entry: a reject route whose dst reports
 * -EACCES and hands packets to the ip6_pkt_prohibit handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/*
 * Template for the "blackhole" route entry: a reject route whose dst reports
 * -EINVAL and passes packets to dst_discard in both directions.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
260
261 /* allocate dst with ip6_dst_ops */
262 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
263                                              struct net_device *dev,
264                                              int flags)
265 {
266         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
267
268         if (rt)
269                 memset(&rt->rt6i_table, 0,
270                        sizeof(*rt) - sizeof(struct dst_entry));
271
272         return rt;
273 }
274
275 static void ip6_dst_destroy(struct dst_entry *dst)
276 {
277         struct rt6_info *rt = (struct rt6_info *)dst;
278         struct inet6_dev *idev = rt->rt6i_idev;
279         struct inet_peer *peer = rt->rt6i_peer;
280
281         if (!(rt->dst.flags & DST_HOST))
282                 dst_destroy_metrics_generic(dst);
283
284         if (idev) {
285                 rt->rt6i_idev = NULL;
286                 in6_dev_put(idev);
287         }
288         if (peer) {
289                 rt->rt6i_peer = NULL;
290                 inet_putpeer(peer);
291         }
292 }
293
294 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
295
296 static u32 rt6_peer_genid(void)
297 {
298         return atomic_read(&__rt6_peer_genid);
299 }
300
301 void rt6_bind_peer(struct rt6_info *rt, int create)
302 {
303         struct inet_peer *peer;
304
305         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
306         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
307                 inet_putpeer(peer);
308         else
309                 rt->rt6i_peer_genid = rt6_peer_genid();
310 }
311
312 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
313                            int how)
314 {
315         struct rt6_info *rt = (struct rt6_info *)dst;
316         struct inet6_dev *idev = rt->rt6i_idev;
317         struct net_device *loopback_dev =
318                 dev_net(dev)->loopback_dev;
319
320         if (dev != loopback_dev && idev && idev->dev == dev) {
321                 struct inet6_dev *loopback_idev =
322                         in6_dev_get(loopback_dev);
323                 if (loopback_idev) {
324                         rt->rt6i_idev = loopback_idev;
325                         in6_dev_put(idev);
326                 }
327         }
328 }
329
330 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
331 {
332         return (rt->rt6i_flags & RTF_EXPIRES) &&
333                 time_after(jiffies, rt->dst.expires);
334 }
335
336 static inline int rt6_need_strict(const struct in6_addr *daddr)
337 {
338         return ipv6_addr_type(daddr) &
339                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
340 }
341
342 /*
343  *      Route lookup. Any table->tb6_lock is implied.
344  */
345
346 static inline struct rt6_info *rt6_device_match(struct net *net,
347                                                     struct rt6_info *rt,
348                                                     const struct in6_addr *saddr,
349                                                     int oif,
350                                                     int flags)
351 {
352         struct rt6_info *local = NULL;
353         struct rt6_info *sprt;
354
355         if (!oif && ipv6_addr_any(saddr))
356                 goto out;
357
358         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
359                 struct net_device *dev = sprt->dst.dev;
360
361                 if (oif) {
362                         if (dev->ifindex == oif)
363                                 return sprt;
364                         if (dev->flags & IFF_LOOPBACK) {
365                                 if (!sprt->rt6i_idev ||
366                                     sprt->rt6i_idev->dev->ifindex != oif) {
367                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
368                                                 continue;
369                                         if (local && (!oif ||
370                                                       local->rt6i_idev->dev->ifindex == oif))
371                                                 continue;
372                                 }
373                                 local = sprt;
374                         }
375                 } else {
376                         if (ipv6_chk_addr(net, saddr, dev,
377                                           flags & RT6_LOOKUP_F_IFACE))
378                                 return sprt;
379                 }
380         }
381
382         if (oif) {
383                 if (local)
384                         return local;
385
386                 if (flags & RT6_LOOKUP_F_IFACE)
387                         return net->ipv6.ip6_null_entry;
388         }
389 out:
390         return rt;
391 }
392
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a neighbour solicitation towards the
 * neighbour of @rt when that neighbour is not in a NUD_VALID state.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute; here a probe is sent only if more than rtr_probe_interval has
 * elapsed since neigh->updated, and neigh->updated is then reset.
 *
 * @rt may be NULL, in which case nothing happens.
 *
 * The neighbour's lock is taken for writing, not reading: this function
 * stores to neigh->updated, and holding only the read side would let
 * several CPUs pass the rate-limit test and write the field concurrently.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	struct in6_addr mcaddr;
	struct in6_addr *target;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
	/* Cheap unlocked pre-check; repeated under the lock below. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	write_lock_bh(&neigh->lock);
	if ((neigh->nud_state & NUD_VALID) ||
	    !time_after(jiffies,
			neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		write_unlock_bh(&neigh->lock);
		goto out;
	}
	neigh->updated = jiffies;
	write_unlock_bh(&neigh->lock);

	/* Send the solicitation after dropping the neighbour lock. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
out:
	rcu_read_unlock();
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
432
433 /*
434  * Default Router Selection (RFC 2461 6.3.6)
435  */
436 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
437 {
438         struct net_device *dev = rt->dst.dev;
439         if (!oif || dev->ifindex == oif)
440                 return 2;
441         if ((dev->flags & IFF_LOOPBACK) &&
442             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
443                 return 1;
444         return 0;
445 }
446
447 static inline int rt6_check_neigh(struct rt6_info *rt)
448 {
449         struct neighbour *neigh;
450         int m;
451
452         rcu_read_lock();
453         neigh = dst_get_neighbour_noref(&rt->dst);
454         if (rt->rt6i_flags & RTF_NONEXTHOP ||
455             !(rt->rt6i_flags & RTF_GATEWAY))
456                 m = 1;
457         else if (neigh) {
458                 read_lock_bh(&neigh->lock);
459                 if (neigh->nud_state & NUD_VALID)
460                         m = 2;
461 #ifdef CONFIG_IPV6_ROUTER_PREF
462                 else if (neigh->nud_state & NUD_FAILED)
463                         m = 0;
464 #endif
465                 else
466                         m = 1;
467                 read_unlock_bh(&neigh->lock);
468         } else
469                 m = 0;
470         rcu_read_unlock();
471         return m;
472 }
473
474 static int rt6_score_route(struct rt6_info *rt, int oif,
475                            int strict)
476 {
477         int m, n;
478
479         m = rt6_check_dev(rt, oif);
480         if (!m && (strict & RT6_LOOKUP_F_IFACE))
481                 return -1;
482 #ifdef CONFIG_IPV6_ROUTER_PREF
483         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
484 #endif
485         n = rt6_check_neigh(rt);
486         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
487                 return -1;
488         return m;
489 }
490
491 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
492                                    int *mpri, struct rt6_info *match)
493 {
494         int m;
495
496         if (rt6_check_expired(rt))
497                 goto out;
498
499         m = rt6_score_route(rt, oif, strict);
500         if (m < 0)
501                 goto out;
502
503         if (m > *mpri) {
504                 if (strict & RT6_LOOKUP_F_REACHABLE)
505                         rt6_probe(match);
506                 *mpri = m;
507                 match = rt;
508         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
509                 rt6_probe(rt);
510         }
511
512 out:
513         return match;
514 }
515
516 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
517                                      struct rt6_info *rr_head,
518                                      u32 metric, int oif, int strict)
519 {
520         struct rt6_info *rt, *match;
521         int mpri = -1;
522
523         match = NULL;
524         for (rt = rr_head; rt && rt->rt6i_metric == metric;
525              rt = rt->dst.rt6_next)
526                 match = find_match(rt, oif, strict, &mpri, match);
527         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
528              rt = rt->dst.rt6_next)
529                 match = find_match(rt, oif, strict, &mpri, match);
530
531         return match;
532 }
533
534 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
535 {
536         struct rt6_info *match, *rt0;
537         struct net *net;
538
539         rt0 = fn->rr_ptr;
540         if (!rt0)
541                 fn->rr_ptr = rt0 = fn->leaf;
542
543         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
544
545         if (!match &&
546             (strict & RT6_LOOKUP_F_REACHABLE)) {
547                 struct rt6_info *next = rt0->dst.rt6_next;
548
549                 /* no entries matched; do round-robin */
550                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
551                         next = fn->leaf;
552
553                 if (next != rt0)
554                         fn->rr_ptr = next;
555         }
556
557         net = dev_net(rt0->dst.dev);
558         return match ? match : net->ipv6.ip6_null_entry;
559 }
560
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Handle a route_info option @opt of @len bytes seen on @dev, with
 * @gwaddr as the gateway (presumably taken from a Router Advertisement --
 * confirm against the caller).
 *
 * The option is validated, then the matching route is deleted (zero
 * lifetime), created (none exists and the lifetime is non-zero) or has its
 * preference and expiry refreshed.
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) |
				 RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex, pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->dst.expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
634
/*
 * Walk back up the fib6 tree while the current candidate is the null entry.
 *
 * This macro is not self-contained.  It must be expanded inside a function
 * that provides the local variables 'rt' and 'fn' and the labels 'out' and
 * 'restart':
 *   - it jumps to 'out' when 'fn' is a top-level root (RTN_TL_ROOT);
 *   - it jumps to 'restart' once 'fn' has been moved to a node carrying
 *     route information (RTN_RTINFO).
 * Each step moves to the parent, or into the parent's subtree (looked up
 * by @saddr) when the parent has one that is not the current node.
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == (__net)->ipv6.ip6_null_entry) {			\
		struct fib6_node *parent;				\
		for (;;) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			parent = fn->parent;				\
			if (FIB6_SUBTREE(parent) &&			\
			    FIB6_SUBTREE(parent) != fn)			\
				fn = fib6_lookup(FIB6_SUBTREE(parent),	\
						 NULL, saddr);		\
			else						\
				fn = parent;				\
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while (0)
652
653 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
654                                              struct fib6_table *table,
655                                              struct flowi6 *fl6, int flags)
656 {
657         struct fib6_node *fn;
658         struct rt6_info *rt;
659
660         read_lock_bh(&table->tb6_lock);
661         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
662 restart:
663         rt = fn->leaf;
664         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
665         BACKTRACK(net, &fl6->saddr);
666 out:
667         dst_use(&rt->dst, jiffies);
668         read_unlock_bh(&table->tb6_lock);
669         return rt;
670
671 }
672
673 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
674                                     int flags)
675 {
676         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
677 }
678 EXPORT_SYMBOL_GPL(ip6_route_lookup);
679
680 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
681                             const struct in6_addr *saddr, int oif, int strict)
682 {
683         struct flowi6 fl6 = {
684                 .flowi6_oif = oif,
685                 .daddr = *daddr,
686         };
687         struct dst_entry *dst;
688         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
689
690         if (saddr) {
691                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
692                 flags |= RT6_LOOKUP_F_HAS_SADDR;
693         }
694
695         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
696         if (dst->error == 0)
697                 return (struct rt6_info *) dst;
698
699         dst_release(dst);
700
701         return NULL;
702 }
703
704 EXPORT_SYMBOL(rt6_lookup);
705
706 /* ip6_ins_rt is called with FREE table->tb6_lock.
707    It takes new route entry, the addition fails by any reason the
708    route is freed. In any case, if caller does not hold it, it may
709    be destroyed.
710  */
711
712 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
713 {
714         int err;
715         struct fib6_table *table;
716
717         table = rt->rt6i_table;
718         write_lock_bh(&table->tb6_lock);
719         err = fib6_add(&table->tb6_root, rt, info);
720         write_unlock_bh(&table->tb6_lock);
721
722         return err;
723 }
724
725 int ip6_ins_rt(struct rt6_info *rt)
726 {
727         struct nl_info info = {
728                 .nl_net = dev_net(rt->dst.dev),
729         };
730         return __ip6_ins_rt(rt, &info);
731 }
732
733 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
734                                       const struct in6_addr *daddr,
735                                       const struct in6_addr *saddr)
736 {
737         struct rt6_info *rt;
738
739         /*
740          *      Clone the route.
741          */
742
743         rt = ip6_rt_copy(ort, daddr);
744
745         if (rt) {
746                 int attempts = !in_softirq();
747
748                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
749                         if (ort->rt6i_dst.plen != 128 &&
750                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
751                                 rt->rt6i_flags |= RTF_ANYCAST;
752                         rt->rt6i_gateway = *daddr;
753                 }
754
755                 rt->rt6i_flags |= RTF_CACHE;
756
757 #ifdef CONFIG_IPV6_SUBTREES
758                 if (rt->rt6i_src.plen && saddr) {
759                         rt->rt6i_src.addr = *saddr;
760                         rt->rt6i_src.plen = 128;
761                 }
762 #endif
763
764         retry:
765                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
766                         struct net *net = dev_net(rt->dst.dev);
767                         int saved_rt_min_interval =
768                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
769                         int saved_rt_elasticity =
770                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
771
772                         if (attempts-- > 0) {
773                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
774                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
775
776                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
777
778                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
779                                         saved_rt_elasticity;
780                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
781                                         saved_rt_min_interval;
782                                 goto retry;
783                         }
784
785                         if (net_ratelimit())
786                                 printk(KERN_WARNING
787                                        "ipv6: Neighbour table overflow.\n");
788                         dst_free(&rt->dst);
789                         return NULL;
790                 }
791         }
792
793         return rt;
794 }
795
796 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
797                                         const struct in6_addr *daddr)
798 {
799         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
800
801         if (rt) {
802                 rt->rt6i_flags |= RTF_CACHE;
803                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
804         }
805         return rt;
806 }
807
808 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
809                                       struct flowi6 *fl6, int flags)
810 {
811         struct fib6_node *fn;
812         struct rt6_info *rt, *nrt;
813         int strict = 0;
814         int attempts = 3;
815         int err;
816         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
817
818         strict |= flags & RT6_LOOKUP_F_IFACE;
819
820 relookup:
821         read_lock_bh(&table->tb6_lock);
822
823 restart_2:
824         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
825
826 restart:
827         rt = rt6_select(fn, oif, strict | reachable);
828
829         BACKTRACK(net, &fl6->saddr);
830         if (rt == net->ipv6.ip6_null_entry ||
831             rt->rt6i_flags & RTF_CACHE)
832                 goto out;
833
834         dst_hold(&rt->dst);
835         read_unlock_bh(&table->tb6_lock);
836
837         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
838                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
839         else if (!(rt->dst.flags & DST_HOST))
840                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
841         else
842                 goto out2;
843
844         dst_release(&rt->dst);
845         rt = nrt ? : net->ipv6.ip6_null_entry;
846
847         dst_hold(&rt->dst);
848         if (nrt) {
849                 err = ip6_ins_rt(nrt);
850                 if (!err)
851                         goto out2;
852         }
853
854         if (--attempts <= 0)
855                 goto out2;
856
857         /*
858          * Race condition! In the gap, when table->tb6_lock was
859          * released someone could insert this route.  Relookup.
860          */
861         dst_release(&rt->dst);
862         goto relookup;
863
864 out:
865         if (reachable) {
866                 reachable = 0;
867                 goto restart_2;
868         }
869         dst_hold(&rt->dst);
870         read_unlock_bh(&table->tb6_lock);
871 out2:
872         rt->dst.lastuse = jiffies;
873         rt->dst.__use++;
874
875         return rt;
876 }
877
878 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
879                                             struct flowi6 *fl6, int flags)
880 {
881         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
882 }
883
884 static struct dst_entry *ip6_route_input_lookup(struct net *net,
885                                                 struct net_device *dev,
886                                                 struct flowi6 *fl6, int flags)
887 {
888         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
889                 flags |= RT6_LOOKUP_F_IFACE;
890
891         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
892 }
893
894 void ip6_route_input(struct sk_buff *skb)
895 {
896         const struct ipv6hdr *iph = ipv6_hdr(skb);
897         struct net *net = dev_net(skb->dev);
898         int flags = RT6_LOOKUP_F_HAS_SADDR;
899         struct flowi6 fl6 = {
900                 .flowi6_iif = skb->dev->ifindex,
901                 .daddr = iph->daddr,
902                 .saddr = iph->saddr,
903                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
904                 .flowi6_mark = skb->mark,
905                 .flowi6_proto = iph->nexthdr,
906         };
907
908         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
909 }
910
911 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
912                                              struct flowi6 *fl6, int flags)
913 {
914         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
915 }
916
917 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
918                                     struct flowi6 *fl6)
919 {
920         int flags = 0;
921
922         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
923                 flags |= RT6_LOOKUP_F_IFACE;
924
925         if (!ipv6_addr_any(&fl6->saddr))
926                 flags |= RT6_LOOKUP_F_HAS_SADDR;
927         else if (sk)
928                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
929
930         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
931 }
932
933 EXPORT_SYMBOL(ip6_route_output);
934
935 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
936 {
937         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
938         struct dst_entry *new = NULL;
939
940         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
941         if (rt) {
942                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
943
944                 new = &rt->dst;
945
946                 new->__use = 1;
947                 new->input = dst_discard;
948                 new->output = dst_discard;
949
950                 if (dst_metrics_read_only(&ort->dst))
951                         new->_metrics = ort->dst._metrics;
952                 else
953                         dst_copy_metrics(new, &ort->dst);
954                 rt->rt6i_idev = ort->rt6i_idev;
955                 if (rt->rt6i_idev)
956                         in6_dev_hold(rt->rt6i_idev);
957                 rt->dst.expires = 0;
958
959                 rt->rt6i_gateway = ort->rt6i_gateway;
960                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
961                 rt->rt6i_metric = 0;
962
963                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
964 #ifdef CONFIG_IPV6_SUBTREES
965                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
966 #endif
967
968                 dst_free(new);
969         }
970
971         dst_release(dst_orig);
972         return new ? new : ERR_PTR(-ENOMEM);
973 }
974
975 /*
976  *      Destination cache support functions
977  */
978
979 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
980 {
981         struct rt6_info *rt;
982
983         rt = (struct rt6_info *) dst;
984
985         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
986                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
987                         if (!rt->rt6i_peer)
988                                 rt6_bind_peer(rt, 0);
989                         rt->rt6i_peer_genid = rt6_peer_genid();
990                 }
991                 return dst;
992         }
993         return NULL;
994 }
995
996 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
997 {
998         struct rt6_info *rt = (struct rt6_info *) dst;
999
1000         if (rt) {
1001                 if (rt->rt6i_flags & RTF_CACHE) {
1002                         if (rt6_check_expired(rt)) {
1003                                 ip6_del_rt(rt);
1004                                 dst = NULL;
1005                         }
1006                 } else {
1007                         dst_release(dst);
1008                         dst = NULL;
1009                 }
1010         }
1011         return dst;
1012 }
1013
1014 static void ip6_link_failure(struct sk_buff *skb)
1015 {
1016         struct rt6_info *rt;
1017
1018         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1019
1020         rt = (struct rt6_info *) skb_dst(skb);
1021         if (rt) {
1022                 if (rt->rt6i_flags & RTF_CACHE) {
1023                         dst_set_expires(&rt->dst, 0);
1024                         rt->rt6i_flags |= RTF_EXPIRES;
1025                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1026                         rt->rt6i_node->fn_sernum = -1;
1027         }
1028 }
1029
1030 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1031 {
1032         struct rt6_info *rt6 = (struct rt6_info*)dst;
1033
1034         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1035                 rt6->rt6i_flags |= RTF_MODIFIED;
1036                 if (mtu < IPV6_MIN_MTU) {
1037                         u32 features = dst_metric(dst, RTAX_FEATURES);
1038                         mtu = IPV6_MIN_MTU;
1039                         features |= RTAX_FEATURE_ALLFRAG;
1040                         dst_metric_set(dst, RTAX_FEATURES, features);
1041                 }
1042                 dst_metric_set(dst, RTAX_MTU, mtu);
1043         }
1044 }
1045
1046 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1047 {
1048         struct net_device *dev = dst->dev;
1049         unsigned int mtu = dst_mtu(dst);
1050         struct net *net = dev_net(dev);
1051
1052         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1053
1054         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1055                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1056
1057         /*
1058          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1059          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1060          * IPV6_MAXPLEN is also valid and means: "any MSS,
1061          * rely only on pmtu discovery"
1062          */
1063         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1064                 mtu = IPV6_MAXPLEN;
1065         return mtu;
1066 }
1067
1068 static unsigned int ip6_mtu(const struct dst_entry *dst)
1069 {
1070         struct inet6_dev *idev;
1071         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1072
1073         if (mtu)
1074                 return mtu;
1075
1076         mtu = IPV6_MIN_MTU;
1077
1078         rcu_read_lock();
1079         idev = __in6_dev_get(dst->dev);
1080         if (idev)
1081                 mtu = idev->cnf.mtu6;
1082         rcu_read_unlock();
1083
1084         return mtu;
1085 }
1086
/*
 * Singly linked list (chained through dst->next) of the dst entries
 * created by icmp6_dst_alloc(). All list manipulation in this file is
 * done with icmp6_dst_lock held.
 */
static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);
1089
1090 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1091                                   struct neighbour *neigh,
1092                                   struct flowi6 *fl6)
1093 {
1094         struct dst_entry *dst;
1095         struct rt6_info *rt;
1096         struct inet6_dev *idev = in6_dev_get(dev);
1097         struct net *net = dev_net(dev);
1098
1099         if (unlikely(!idev))
1100                 return ERR_PTR(-ENODEV);
1101
1102         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1103         if (unlikely(!rt)) {
1104                 in6_dev_put(idev);
1105                 dst = ERR_PTR(-ENOMEM);
1106                 goto out;
1107         }
1108
1109         if (neigh)
1110                 neigh_hold(neigh);
1111         else {
1112                 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1113                 if (IS_ERR(neigh)) {
1114                         in6_dev_put(idev);
1115                         dst_free(&rt->dst);
1116                         return ERR_CAST(neigh);
1117                 }
1118         }
1119
1120         rt->dst.flags |= DST_HOST;
1121         rt->dst.output  = ip6_output;
1122         dst_set_neighbour(&rt->dst, neigh);
1123         atomic_set(&rt->dst.__refcnt, 1);
1124         rt->rt6i_dst.addr = fl6->daddr;
1125         rt->rt6i_dst.plen = 128;
1126         rt->rt6i_idev     = idev;
1127         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1128
1129         spin_lock_bh(&icmp6_dst_lock);
1130         rt->dst.next = icmp6_dst_gc_list;
1131         icmp6_dst_gc_list = &rt->dst;
1132         spin_unlock_bh(&icmp6_dst_lock);
1133
1134         fib6_force_start_gc(net);
1135
1136         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1137
1138 out:
1139         return dst;
1140 }
1141
1142 int icmp6_dst_gc(void)
1143 {
1144         struct dst_entry *dst, **pprev;
1145         int more = 0;
1146
1147         spin_lock_bh(&icmp6_dst_lock);
1148         pprev = &icmp6_dst_gc_list;
1149
1150         while ((dst = *pprev) != NULL) {
1151                 if (!atomic_read(&dst->__refcnt)) {
1152                         *pprev = dst->next;
1153                         dst_free(dst);
1154                 } else {
1155                         pprev = &dst->next;
1156                         ++more;
1157                 }
1158         }
1159
1160         spin_unlock_bh(&icmp6_dst_lock);
1161
1162         return more;
1163 }
1164
1165 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1166                             void *arg)
1167 {
1168         struct dst_entry *dst, **pprev;
1169
1170         spin_lock_bh(&icmp6_dst_lock);
1171         pprev = &icmp6_dst_gc_list;
1172         while ((dst = *pprev) != NULL) {
1173                 struct rt6_info *rt = (struct rt6_info *) dst;
1174                 if (func(rt, arg)) {
1175                         *pprev = dst->next;
1176                         dst_free(dst);
1177                 } else {
1178                         pprev = &dst->next;
1179                 }
1180         }
1181         spin_unlock_bh(&icmp6_dst_lock);
1182 }
1183
1184 static int ip6_dst_gc(struct dst_ops *ops)
1185 {
1186         unsigned long now = jiffies;
1187         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1188         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1189         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1190         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1191         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1192         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1193         int entries;
1194
1195         entries = dst_entries_get_fast(ops);
1196         if (time_after(rt_last_gc + rt_min_interval, now) &&
1197             entries <= rt_max_size)
1198                 goto out;
1199
1200         net->ipv6.ip6_rt_gc_expire++;
1201         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1202         net->ipv6.ip6_rt_last_gc = now;
1203         entries = dst_entries_get_slow(ops);
1204         if (entries < ops->gc_thresh)
1205                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1206 out:
1207         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1208         return entries > rt_max_size;
1209 }
1210
1211 /* Clean host part of a prefix. Not necessary in radix tree,
1212    but results in cleaner routing tables.
1213
1214    Remove it only when all the things will work!
1215  */
1216
1217 int ip6_dst_hoplimit(struct dst_entry *dst)
1218 {
1219         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1220         if (hoplimit == 0) {
1221                 struct net_device *dev = dst->dev;
1222                 struct inet6_dev *idev;
1223
1224                 rcu_read_lock();
1225                 idev = __in6_dev_get(dev);
1226                 if (idev)
1227                         hoplimit = idev->cnf.hop_limit;
1228                 else
1229                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1230                 rcu_read_unlock();
1231         }
1232         return hoplimit;
1233 }
1234 EXPORT_SYMBOL(ip6_dst_hoplimit);
1235
1236 /*
1237  *
1238  */
1239
1240 int ip6_route_add(struct fib6_config *cfg)
1241 {
1242         int err;
1243         struct net *net = cfg->fc_nlinfo.nl_net;
1244         struct rt6_info *rt = NULL;
1245         struct net_device *dev = NULL;
1246         struct inet6_dev *idev = NULL;
1247         struct fib6_table *table;
1248         int addr_type;
1249
1250         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1251                 return -EINVAL;
1252 #ifndef CONFIG_IPV6_SUBTREES
1253         if (cfg->fc_src_len)
1254                 return -EINVAL;
1255 #endif
1256         if (cfg->fc_ifindex) {
1257                 err = -ENODEV;
1258                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1259                 if (!dev)
1260                         goto out;
1261                 idev = in6_dev_get(dev);
1262                 if (!idev)
1263                         goto out;
1264         }
1265
1266         if (cfg->fc_metric == 0)
1267                 cfg->fc_metric = IP6_RT_PRIO_USER;
1268
1269         err = -ENOBUFS;
1270         if (cfg->fc_nlinfo.nlh &&
1271             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1272                 table = fib6_get_table(net, cfg->fc_table);
1273                 if (!table) {
1274                         printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1275                         table = fib6_new_table(net, cfg->fc_table);
1276                 }
1277         } else {
1278                 table = fib6_new_table(net, cfg->fc_table);
1279         }
1280
1281         if (!table)
1282                 goto out;
1283
1284         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1285
1286         if (!rt) {
1287                 err = -ENOMEM;
1288                 goto out;
1289         }
1290
1291         rt->dst.obsolete = -1;
1292         rt->dst.expires = (cfg->fc_flags & RTF_EXPIRES) ?
1293                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1294                                 0;
1295
1296         if (cfg->fc_protocol == RTPROT_UNSPEC)
1297                 cfg->fc_protocol = RTPROT_BOOT;
1298         rt->rt6i_protocol = cfg->fc_protocol;
1299
1300         addr_type = ipv6_addr_type(&cfg->fc_dst);
1301
1302         if (addr_type & IPV6_ADDR_MULTICAST)
1303                 rt->dst.input = ip6_mc_input;
1304         else if (cfg->fc_flags & RTF_LOCAL)
1305                 rt->dst.input = ip6_input;
1306         else
1307                 rt->dst.input = ip6_forward;
1308
1309         rt->dst.output = ip6_output;
1310
1311         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1312         rt->rt6i_dst.plen = cfg->fc_dst_len;
1313         if (rt->rt6i_dst.plen == 128)
1314                rt->dst.flags |= DST_HOST;
1315
1316         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1317                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1318                 if (!metrics) {
1319                         err = -ENOMEM;
1320                         goto out;
1321                 }
1322                 dst_init_metrics(&rt->dst, metrics, 0);
1323         }
1324 #ifdef CONFIG_IPV6_SUBTREES
1325         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1326         rt->rt6i_src.plen = cfg->fc_src_len;
1327 #endif
1328
1329         rt->rt6i_metric = cfg->fc_metric;
1330
1331         /* We cannot add true routes via loopback here,
1332            they would result in kernel looping; promote them to reject routes
1333          */
1334         if ((cfg->fc_flags & RTF_REJECT) ||
1335             (dev && (dev->flags & IFF_LOOPBACK) &&
1336              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1337              !(cfg->fc_flags & RTF_LOCAL))) {
1338                 /* hold loopback dev/idev if we haven't done so. */
1339                 if (dev != net->loopback_dev) {
1340                         if (dev) {
1341                                 dev_put(dev);
1342                                 in6_dev_put(idev);
1343                         }
1344                         dev = net->loopback_dev;
1345                         dev_hold(dev);
1346                         idev = in6_dev_get(dev);
1347                         if (!idev) {
1348                                 err = -ENODEV;
1349                                 goto out;
1350                         }
1351                 }
1352                 rt->dst.output = ip6_pkt_discard_out;
1353                 rt->dst.input = ip6_pkt_discard;
1354                 rt->dst.error = -ENETUNREACH;
1355                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1356                 goto install_route;
1357         }
1358
1359         if (cfg->fc_flags & RTF_GATEWAY) {
1360                 const struct in6_addr *gw_addr;
1361                 int gwa_type;
1362
1363                 gw_addr = &cfg->fc_gateway;
1364                 rt->rt6i_gateway = *gw_addr;
1365                 gwa_type = ipv6_addr_type(gw_addr);
1366
1367                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1368                         struct rt6_info *grt;
1369
1370                         /* IPv6 strictly inhibits using not link-local
1371                            addresses as nexthop address.
1372                            Otherwise, router will not able to send redirects.
1373                            It is very good, but in some (rare!) circumstances
1374                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1375                            some exceptions. --ANK
1376                          */
1377                         err = -EINVAL;
1378                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1379                                 goto out;
1380
1381                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1382
1383                         err = -EHOSTUNREACH;
1384                         if (!grt)
1385                                 goto out;
1386                         if (dev) {
1387                                 if (dev != grt->dst.dev) {
1388                                         dst_release(&grt->dst);
1389                                         goto out;
1390                                 }
1391                         } else {
1392                                 dev = grt->dst.dev;
1393                                 idev = grt->rt6i_idev;
1394                                 dev_hold(dev);
1395                                 in6_dev_hold(grt->rt6i_idev);
1396                         }
1397                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1398                                 err = 0;
1399                         dst_release(&grt->dst);
1400
1401                         if (err)
1402                                 goto out;
1403                 }
1404                 err = -EINVAL;
1405                 if (!dev || (dev->flags & IFF_LOOPBACK))
1406                         goto out;
1407         }
1408
1409         err = -ENODEV;
1410         if (!dev)
1411                 goto out;
1412
1413         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1414                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1415                         err = -EINVAL;
1416                         goto out;
1417                 }
1418                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1419                 rt->rt6i_prefsrc.plen = 128;
1420         } else
1421                 rt->rt6i_prefsrc.plen = 0;
1422
1423         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1424                 err = rt6_bind_neighbour(rt, dev);
1425                 if (err)
1426                         goto out;
1427         }
1428
1429         rt->rt6i_flags = cfg->fc_flags;
1430
1431 install_route:
1432         if (cfg->fc_mx) {
1433                 struct nlattr *nla;
1434                 int remaining;
1435
1436                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1437                         int type = nla_type(nla);
1438
1439                         if (type) {
1440                                 if (type > RTAX_MAX) {
1441                                         err = -EINVAL;
1442                                         goto out;
1443                                 }
1444
1445                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1446                         }
1447                 }
1448         }
1449
1450         rt->dst.dev = dev;
1451         rt->rt6i_idev = idev;
1452         rt->rt6i_table = table;
1453
1454         cfg->fc_nlinfo.nl_net = dev_net(dev);
1455
1456         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1457
1458 out:
1459         if (dev)
1460                 dev_put(dev);
1461         if (idev)
1462                 in6_dev_put(idev);
1463         if (rt)
1464                 dst_free(&rt->dst);
1465         return err;
1466 }
1467
1468 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1469 {
1470         int err;
1471         struct fib6_table *table;
1472         struct net *net = dev_net(rt->dst.dev);
1473
1474         if (rt == net->ipv6.ip6_null_entry)
1475                 return -ENOENT;
1476
1477         table = rt->rt6i_table;
1478         write_lock_bh(&table->tb6_lock);
1479
1480         err = fib6_del(rt, info);
1481         dst_release(&rt->dst);
1482
1483         write_unlock_bh(&table->tb6_lock);
1484
1485         return err;
1486 }
1487
1488 int ip6_del_rt(struct rt6_info *rt)
1489 {
1490         struct nl_info info = {
1491                 .nl_net = dev_net(rt->dst.dev),
1492         };
1493         return __ip6_del_rt(rt, &info);
1494 }
1495
1496 static int ip6_route_del(struct fib6_config *cfg)
1497 {
1498         struct fib6_table *table;
1499         struct fib6_node *fn;
1500         struct rt6_info *rt;
1501         int err = -ESRCH;
1502
1503         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1504         if (!table)
1505                 return err;
1506
1507         read_lock_bh(&table->tb6_lock);
1508
1509         fn = fib6_locate(&table->tb6_root,
1510                          &cfg->fc_dst, cfg->fc_dst_len,
1511                          &cfg->fc_src, cfg->fc_src_len);
1512
1513         if (fn) {
1514                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1515                         if (cfg->fc_ifindex &&
1516                             (!rt->dst.dev ||
1517                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1518                                 continue;
1519                         if (cfg->fc_flags & RTF_GATEWAY &&
1520                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1521                                 continue;
1522                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1523                                 continue;
1524                         dst_hold(&rt->dst);
1525                         read_unlock_bh(&table->tb6_lock);
1526
1527                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1528                 }
1529         }
1530         read_unlock_bh(&table->tb6_lock);
1531
1532         return err;
1533 }
1534
1535 /*
1536  *      Handle redirects
1537  */
/*
 * Flow key for redirect lookups: a flowi6 plus the address of the
 * gateway the redirect is checked against. fl6 must stay the first
 * member, because __ip6_route_redirect() casts the struct flowi6 pointer
 * it receives back to this type.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* regular lookup key */
	struct in6_addr gateway;	/* compared with each candidate's rt6i_gateway */
};
1542
1543 static struct rt6_info *__ip6_route_redirect(struct net *net,
1544                                              struct fib6_table *table,
1545                                              struct flowi6 *fl6,
1546                                              int flags)
1547 {
1548         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1549         struct rt6_info *rt;
1550         struct fib6_node *fn;
1551
1552         /*
1553          * Get the "current" route for this destination and
1554          * check if the redirect has come from approriate router.
1555          *
1556          * RFC 2461 specifies that redirects should only be
1557          * accepted if they come from the nexthop to the target.
1558          * Due to the way the routes are chosen, this notion
1559          * is a bit fuzzy and one might need to check all possible
1560          * routes.
1561          */
1562
1563         read_lock_bh(&table->tb6_lock);
1564         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1565 restart:
1566         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1567                 /*
1568                  * Current route is on-link; redirect is always invalid.
1569                  *
1570                  * Seems, previous statement is not true. It could
1571                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1572                  * But then router serving it might decide, that we should
1573                  * know truth 8)8) --ANK (980726).
1574                  */
1575                 if (rt6_check_expired(rt))
1576                         continue;
1577                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1578                         continue;
1579                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1580                         continue;
1581                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1582                         continue;
1583                 break;
1584         }
1585
1586         if (!rt)
1587                 rt = net->ipv6.ip6_null_entry;
1588         BACKTRACK(net, &fl6->saddr);
1589 out:
1590         dst_hold(&rt->dst);
1591
1592         read_unlock_bh(&table->tb6_lock);
1593
1594         return rt;
1595 };
1596
1597 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1598                                            const struct in6_addr *src,
1599                                            const struct in6_addr *gateway,
1600                                            struct net_device *dev)
1601 {
1602         int flags = RT6_LOOKUP_F_HAS_SADDR;
1603         struct net *net = dev_net(dev);
1604         struct ip6rd_flowi rdfl = {
1605                 .fl6 = {
1606                         .flowi6_oif = dev->ifindex,
1607                         .daddr = *dest,
1608                         .saddr = *src,
1609                 },
1610         };
1611
1612         rdfl.gateway = *gateway;
1613
1614         if (rt6_need_strict(dest))
1615                 flags |= RT6_LOOKUP_F_IFACE;
1616
1617         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1618                                                    flags, __ip6_route_redirect);
1619 }
1620
1621 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1622                   const struct in6_addr *saddr,
1623                   struct neighbour *neigh, u8 *lladdr, int on_link)
1624 {
1625         struct rt6_info *rt, *nrt = NULL;
1626         struct netevent_redirect netevent;
1627         struct net *net = dev_net(neigh->dev);
1628
1629         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1630
1631         if (rt == net->ipv6.ip6_null_entry) {
1632                 if (net_ratelimit())
1633                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1634                                "for redirect target\n");
1635                 goto out;
1636         }
1637
1638         /*
1639          *      We have finally decided to accept it.
1640          */
1641
1642         neigh_update(neigh, lladdr, NUD_STALE,
1643                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1644                      NEIGH_UPDATE_F_OVERRIDE|
1645                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1646                                      NEIGH_UPDATE_F_ISROUTER))
1647                      );
1648
1649         /*
1650          * Redirect received -> path was valid.
1651          * Look, redirects are sent only in response to data packets,
1652          * so that this nexthop apparently is reachable. --ANK
1653          */
1654         dst_confirm(&rt->dst);
1655
1656         /* Duplicate redirect: silently ignore. */
1657         if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1658                 goto out;
1659
1660         nrt = ip6_rt_copy(rt, dest);
1661         if (!nrt)
1662                 goto out;
1663
1664         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1665         if (on_link)
1666                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1667
1668         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1669         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1670
1671         if (ip6_ins_rt(nrt))
1672                 goto out;
1673
1674         netevent.old = &rt->dst;
1675         netevent.new = &nrt->dst;
1676         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1677
1678         if (rt->rt6i_flags & RTF_CACHE) {
1679                 ip6_del_rt(rt);
1680                 return;
1681         }
1682
1683 out:
1684         dst_release(&rt->dst);
1685 }
1686
1687 /*
1688  *      Handle ICMP "packet too big" messages
1689  *      i.e. Path MTU discovery
1690  */
1691
1692 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1693                              struct net *net, u32 pmtu, int ifindex)
1694 {
1695         struct rt6_info *rt, *nrt;
1696         int allfrag = 0;
1697 again:
1698         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1699         if (!rt)
1700                 return;
1701
1702         if (rt6_check_expired(rt)) {
1703                 ip6_del_rt(rt);
1704                 goto again;
1705         }
1706
1707         if (pmtu >= dst_mtu(&rt->dst))
1708                 goto out;
1709
1710         if (pmtu < IPV6_MIN_MTU) {
1711                 /*
1712                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1713                  * MTU (1280) and a fragment header should always be included
1714                  * after a node receiving Too Big message reporting PMTU is
1715                  * less than the IPv6 Minimum Link MTU.
1716                  */
1717                 pmtu = IPV6_MIN_MTU;
1718                 allfrag = 1;
1719         }
1720
1721         /* New mtu received -> path was valid.
1722            They are sent only in response to data packets,
1723            so that this nexthop apparently is reachable. --ANK
1724          */
1725         dst_confirm(&rt->dst);
1726
1727         /* Host route. If it is static, it would be better
1728            not to override it, but add new one, so that
1729            when cache entry will expire old pmtu
1730            would return automatically.
1731          */
1732         if (rt->rt6i_flags & RTF_CACHE) {
1733                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1734                 if (allfrag) {
1735                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1736                         features |= RTAX_FEATURE_ALLFRAG;
1737                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1738                 }
1739                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1740                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1741                 goto out;
1742         }
1743
1744         /* Network route.
1745            Two cases are possible:
1746            1. It is connected route. Action: COW
1747            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1748          */
1749         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1750                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1751         else
1752                 nrt = rt6_alloc_clone(rt, daddr);
1753
1754         if (nrt) {
1755                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1756                 if (allfrag) {
1757                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1758                         features |= RTAX_FEATURE_ALLFRAG;
1759                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1760                 }
1761
1762                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1763                  * happened within 5 mins, the recommended timer is 10 mins.
1764                  * Here this route expiration time is set to ip6_rt_mtu_expires
1765                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1766                  * and detecting PMTU increase will be automatically happened.
1767                  */
1768                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1769                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1770
1771                 ip6_ins_rt(nrt);
1772         }
1773 out:
1774         dst_release(&rt->dst);
1775 }
1776
1777 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1778                         struct net_device *dev, u32 pmtu)
1779 {
1780         struct net *net = dev_net(dev);
1781
1782         /*
1783          * RFC 1981 states that a node "MUST reduce the size of the packets it
1784          * is sending along the path" that caused the Packet Too Big message.
1785          * Since it's not possible in the general case to determine which
1786          * interface was used to send the original packet, we update the MTU
1787          * on the interface that will be used to send future packets. We also
1788          * update the MTU on the interface that received the Packet Too Big in
1789          * case the original packet was forced out that interface with
1790          * SO_BINDTODEVICE or similar. This is the next best thing to the
1791          * correct behaviour, which would be to update the MTU on all
1792          * interfaces.
1793          */
1794         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1795         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1796 }
1797
1798 /*
1799  *      Misc support functions
1800  */
1801
1802 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1803                                     const struct in6_addr *dest)
1804 {
1805         struct net *net = dev_net(ort->dst.dev);
1806         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1807                                             ort->dst.dev, 0);
1808
1809         if (rt) {
1810                 rt->dst.input = ort->dst.input;
1811                 rt->dst.output = ort->dst.output;
1812                 rt->dst.flags |= DST_HOST;
1813
1814                 rt->rt6i_dst.addr = *dest;
1815                 rt->rt6i_dst.plen = 128;
1816                 dst_copy_metrics(&rt->dst, &ort->dst);
1817                 rt->dst.error = ort->dst.error;
1818                 rt->rt6i_idev = ort->rt6i_idev;
1819                 if (rt->rt6i_idev)
1820                         in6_dev_hold(rt->rt6i_idev);
1821                 rt->dst.lastuse = jiffies;
1822                 rt->dst.expires = 0;
1823
1824                 rt->rt6i_gateway = ort->rt6i_gateway;
1825                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1826                 rt->rt6i_metric = 0;
1827
1828 #ifdef CONFIG_IPV6_SUBTREES
1829                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1830 #endif
1831                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1832                 rt->rt6i_table = ort->rt6i_table;
1833         }
1834         return rt;
1835 }
1836
1837 #ifdef CONFIG_IPV6_ROUTE_INFO
1838 static struct rt6_info *rt6_get_route_info(struct net *net,
1839                                            const struct in6_addr *prefix, int prefixlen,
1840                                            const struct in6_addr *gwaddr, int ifindex)
1841 {
1842         struct fib6_node *fn;
1843         struct rt6_info *rt = NULL;
1844         struct fib6_table *table;
1845
1846         table = fib6_get_table(net, RT6_TABLE_INFO);
1847         if (!table)
1848                 return NULL;
1849
1850         write_lock_bh(&table->tb6_lock);
1851         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1852         if (!fn)
1853                 goto out;
1854
1855         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1856                 if (rt->dst.dev->ifindex != ifindex)
1857                         continue;
1858                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1859                         continue;
1860                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1861                         continue;
1862                 dst_hold(&rt->dst);
1863                 break;
1864         }
1865 out:
1866         write_unlock_bh(&table->tb6_lock);
1867         return rt;
1868 }
1869
1870 static struct rt6_info *rt6_add_route_info(struct net *net,
1871                                            const struct in6_addr *prefix, int prefixlen,
1872                                            const struct in6_addr *gwaddr, int ifindex,
1873                                            unsigned pref)
1874 {
1875         struct fib6_config cfg = {
1876                 .fc_table       = RT6_TABLE_INFO,
1877                 .fc_metric      = IP6_RT_PRIO_USER,
1878                 .fc_ifindex     = ifindex,
1879                 .fc_dst_len     = prefixlen,
1880                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1881                                   RTF_UP | RTF_PREF(pref),
1882                 .fc_nlinfo.pid = 0,
1883                 .fc_nlinfo.nlh = NULL,
1884                 .fc_nlinfo.nl_net = net,
1885         };
1886
1887         cfg.fc_dst = *prefix;
1888         cfg.fc_gateway = *gwaddr;
1889
1890         /* We should treat it as a default route if prefix length is 0. */
1891         if (!prefixlen)
1892                 cfg.fc_flags |= RTF_DEFAULT;
1893
1894         ip6_route_add(&cfg);
1895
1896         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1897 }
1898 #endif
1899
1900 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1901 {
1902         struct rt6_info *rt;
1903         struct fib6_table *table;
1904
1905         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1906         if (!table)
1907                 return NULL;
1908
1909         write_lock_bh(&table->tb6_lock);
1910         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1911                 if (dev == rt->dst.dev &&
1912                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1913                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1914                         break;
1915         }
1916         if (rt)
1917                 dst_hold(&rt->dst);
1918         write_unlock_bh(&table->tb6_lock);
1919         return rt;
1920 }
1921
1922 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1923                                      struct net_device *dev,
1924                                      unsigned int pref)
1925 {
1926         struct fib6_config cfg = {
1927                 .fc_table       = RT6_TABLE_DFLT,
1928                 .fc_metric      = IP6_RT_PRIO_USER,
1929                 .fc_ifindex     = dev->ifindex,
1930                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1931                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1932                 .fc_nlinfo.pid = 0,
1933                 .fc_nlinfo.nlh = NULL,
1934                 .fc_nlinfo.nl_net = dev_net(dev),
1935         };
1936
1937         cfg.fc_gateway = *gwaddr;
1938
1939         ip6_route_add(&cfg);
1940
1941         return rt6_get_dflt_router(gwaddr, dev);
1942 }
1943
1944 void rt6_purge_dflt_routers(struct net *net)
1945 {
1946         struct rt6_info *rt;
1947         struct fib6_table *table;
1948
1949         /* NOTE: Keep consistent with rt6_get_dflt_router */
1950         table = fib6_get_table(net, RT6_TABLE_DFLT);
1951         if (!table)
1952                 return;
1953
1954 restart:
1955         read_lock_bh(&table->tb6_lock);
1956         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1957                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1958                         dst_hold(&rt->dst);
1959                         read_unlock_bh(&table->tb6_lock);
1960                         ip6_del_rt(rt);
1961                         goto restart;
1962                 }
1963         }
1964         read_unlock_bh(&table->tb6_lock);
1965 }
1966
1967 static void rtmsg_to_fib6_config(struct net *net,
1968                                  struct in6_rtmsg *rtmsg,
1969                                  struct fib6_config *cfg)
1970 {
1971         memset(cfg, 0, sizeof(*cfg));
1972
1973         cfg->fc_table = RT6_TABLE_MAIN;
1974         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1975         cfg->fc_metric = rtmsg->rtmsg_metric;
1976         cfg->fc_expires = rtmsg->rtmsg_info;
1977         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1978         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1979         cfg->fc_flags = rtmsg->rtmsg_flags;
1980
1981         cfg->fc_nlinfo.nl_net = net;
1982
1983         cfg->fc_dst = rtmsg->rtmsg_dst;
1984         cfg->fc_src = rtmsg->rtmsg_src;
1985         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1986 }
1987
1988 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1989 {
1990         struct fib6_config cfg;
1991         struct in6_rtmsg rtmsg;
1992         int err;
1993
1994         switch(cmd) {
1995         case SIOCADDRT:         /* Add a route */
1996         case SIOCDELRT:         /* Delete a route */
1997                 if (!capable(CAP_NET_ADMIN))
1998                         return -EPERM;
1999                 err = copy_from_user(&rtmsg, arg,
2000                                      sizeof(struct in6_rtmsg));
2001                 if (err)
2002                         return -EFAULT;
2003
2004                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2005
2006                 rtnl_lock();
2007                 switch (cmd) {
2008                 case SIOCADDRT:
2009                         err = ip6_route_add(&cfg);
2010                         break;
2011                 case SIOCDELRT:
2012                         err = ip6_route_del(&cfg);
2013                         break;
2014                 default:
2015                         err = -EINVAL;
2016                 }
2017                 rtnl_unlock();
2018
2019                 return err;
2020         }
2021
2022         return -EINVAL;
2023 }
2024
2025 /*
2026  *      Drop the packet on the floor
2027  */
2028
2029 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2030 {
2031         int type;
2032         struct dst_entry *dst = skb_dst(skb);
2033         switch (ipstats_mib_noroutes) {
2034         case IPSTATS_MIB_INNOROUTES:
2035                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2036                 if (type == IPV6_ADDR_ANY) {
2037                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2038                                       IPSTATS_MIB_INADDRERRORS);
2039                         break;
2040                 }
2041                 /* FALLTHROUGH */
2042         case IPSTATS_MIB_OUTNOROUTES:
2043                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2044                               ipstats_mib_noroutes);
2045                 break;
2046         }
2047         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2048         kfree_skb(skb);
2049         return 0;
2050 }
2051
2052 static int ip6_pkt_discard(struct sk_buff *skb)
2053 {
2054         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2055 }
2056
2057 static int ip6_pkt_discard_out(struct sk_buff *skb)
2058 {
2059         skb->dev = skb_dst(skb)->dev;
2060         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2061 }
2062
2063 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2064
2065 static int ip6_pkt_prohibit(struct sk_buff *skb)
2066 {
2067         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2068 }
2069
2070 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2071 {
2072         skb->dev = skb_dst(skb)->dev;
2073         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2074 }
2075
2076 #endif
2077
2078 /*
2079  *      Allocate a dst for local (unicast / anycast) address.
2080  */
2081
2082 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2083                                     const struct in6_addr *addr,
2084                                     bool anycast)
2085 {
2086         struct net *net = dev_net(idev->dev);
2087         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2088                                             net->loopback_dev, 0);
2089         int err;
2090
2091         if (!rt) {
2092                 if (net_ratelimit())
2093                         pr_warning("IPv6:  Maximum number of routes reached,"
2094                                    " consider increasing route/max_size.\n");
2095                 return ERR_PTR(-ENOMEM);
2096         }
2097
2098         in6_dev_hold(idev);
2099
2100         rt->dst.flags |= DST_HOST;
2101         rt->dst.input = ip6_input;
2102         rt->dst.output = ip6_output;
2103         rt->rt6i_idev = idev;
2104         rt->dst.obsolete = -1;
2105
2106         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2107         if (anycast)
2108                 rt->rt6i_flags |= RTF_ANYCAST;
2109         else
2110                 rt->rt6i_flags |= RTF_LOCAL;
2111         err = rt6_bind_neighbour(rt, rt->dst.dev);
2112         if (err) {
2113                 dst_free(&rt->dst);
2114                 return ERR_PTR(err);
2115         }
2116
2117         rt->rt6i_dst.addr = *addr;
2118         rt->rt6i_dst.plen = 128;
2119         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2120
2121         atomic_set(&rt->dst.__refcnt, 1);
2122
2123         return rt;
2124 }
2125
2126 int ip6_route_get_saddr(struct net *net,
2127                         struct rt6_info *rt,
2128                         const struct in6_addr *daddr,
2129                         unsigned int prefs,
2130                         struct in6_addr *saddr)
2131 {
2132         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2133         int err = 0;
2134         if (rt->rt6i_prefsrc.plen)
2135                 *saddr = rt->rt6i_prefsrc.addr;
2136         else
2137                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2138                                          daddr, prefs, saddr);
2139         return err;
2140 }
2141
2142 /* remove deleted ip from prefsrc entries */
/* Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device filter; NULL matches any device */
	struct net *net;	/* namespace whose ip6_null_entry is skipped */
	struct in6_addr *addr;	/* address compared against rt6i_prefsrc.addr */
};
2148
2149 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2150 {
2151         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2152         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2153         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2154
2155         if (((void *)rt->dst.dev == dev || !dev) &&
2156             rt != net->ipv6.ip6_null_entry &&
2157             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2158                 /* remove prefsrc entry */
2159                 rt->rt6i_prefsrc.plen = 0;
2160         }
2161         return 0;
2162 }
2163
2164 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2165 {
2166         struct net *net = dev_net(ifp->idev->dev);
2167         struct arg_dev_net_ip adni = {
2168                 .dev = ifp->idev->dev,
2169                 .net = net,
2170                 .addr = &ifp->addr,
2171         };
2172         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2173 }
2174
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device filter; NULL matches any device */
	struct net *net;	/* namespace whose ip6_null_entry is skipped */
};
2179
2180 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2181 {
2182         const struct arg_dev_net *adn = arg;
2183         const struct net_device *dev = adn->dev;
2184
2185         if ((rt->dst.dev == dev || !dev) &&
2186             rt != adn->net->ipv6.ip6_null_entry)
2187                 return -1;
2188
2189         return 0;
2190 }
2191
2192 void rt6_ifdown(struct net *net, struct net_device *dev)
2193 {
2194         struct arg_dev_net adn = {
2195                 .dev = dev,
2196                 .net = net,
2197         };
2198
2199         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2200         icmp6_clean_all(fib6_ifdown, &adn);
2201 }
2202
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
2208
2209 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2210 {
2211         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2212         struct inet6_dev *idev;
2213
2214         /* In IPv6 pmtu discovery is not optional,
2215            so that RTAX_MTU lock cannot disable it.
2216            We still use this lock to block changes
2217            caused by addrconf/ndisc.
2218         */
2219
2220         idev = __in6_dev_get(arg->dev);
2221         if (!idev)
2222                 return 0;
2223
2224         /* For administrative MTU increase, there is no way to discover
2225            IPv6 PMTU increase, so PMTU increase should be updated here.
2226            Since RFC 1981 doesn't include administrative MTU increase
2227            update PMTU increase is a MUST. (i.e. jumbo frame)
2228          */
2229         /*
2230            If new MTU is less than route PMTU, this new MTU will be the
2231            lowest MTU in the path, update the route PMTU to reflect PMTU
2232            decreases; if new MTU is greater than route PMTU, and the
2233            old MTU is the lowest MTU in the path, update the route PMTU
2234            to reflect the increase. In this case if the other nodes' MTU
2235            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2236            PMTU discouvery.
2237          */
2238         if (rt->dst.dev == arg->dev &&
2239             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2240             (dst_mtu(&rt->dst) >= arg->mtu ||
2241              (dst_mtu(&rt->dst) < arg->mtu &&
2242               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2243                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2244         }
2245         return 0;
2246 }
2247
2248 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2249 {
2250         struct rt6_mtu_change_arg arg = {
2251                 .dev = dev,
2252                 .mtu = mtu,
2253         };
2254
2255         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2256 }
2257
2258 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2259         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2260         [RTA_OIF]               = { .type = NLA_U32 },
2261         [RTA_IIF]               = { .type = NLA_U32 },
2262         [RTA_PRIORITY]          = { .type = NLA_U32 },
2263         [RTA_METRICS]           = { .type = NLA_NESTED },
2264 };
2265
2266 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2267                               struct fib6_config *cfg)
2268 {
2269         struct rtmsg *rtm;
2270         struct nlattr *tb[RTA_MAX+1];
2271         int err;
2272
2273         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2274         if (err < 0)
2275                 goto errout;
2276
2277         err = -EINVAL;
2278         rtm = nlmsg_data(nlh);
2279         memset(cfg, 0, sizeof(*cfg));
2280
2281         cfg->fc_table = rtm->rtm_table;
2282         cfg->fc_dst_len = rtm->rtm_dst_len;
2283         cfg->fc_src_len = rtm->rtm_src_len;
2284         cfg->fc_flags = RTF_UP;
2285         cfg->fc_protocol = rtm->rtm_protocol;
2286
2287         if (rtm->rtm_type == RTN_UNREACHABLE)
2288                 cfg->fc_flags |= RTF_REJECT;
2289
2290         if (rtm->rtm_type == RTN_LOCAL)
2291                 cfg->fc_flags |= RTF_LOCAL;
2292
2293         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2294         cfg->fc_nlinfo.nlh = nlh;
2295         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2296
2297         if (tb[RTA_GATEWAY]) {
2298                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2299                 cfg->fc_flags |= RTF_GATEWAY;
2300         }
2301
2302         if (tb[RTA_DST]) {
2303                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2304
2305                 if (nla_len(tb[RTA_DST]) < plen)
2306                         goto errout;
2307
2308                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2309         }
2310
2311         if (tb[RTA_SRC]) {
2312                 int plen = (rtm->rtm_src_len + 7) >> 3;
2313
2314                 if (nla_len(tb[RTA_SRC]) < plen)
2315                         goto errout;
2316
2317                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2318         }
2319
2320         if (tb[RTA_PREFSRC])
2321                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2322
2323         if (tb[RTA_OIF])
2324                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2325
2326         if (tb[RTA_PRIORITY])
2327                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2328
2329         if (tb[RTA_METRICS]) {
2330                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2331                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2332         }
2333
2334         if (tb[RTA_TABLE])
2335                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2336
2337         err = 0;
2338 errout:
2339         return err;
2340 }
2341
2342 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2343 {
2344         struct fib6_config cfg;
2345         int err;
2346
2347         err = rtm_to_fib6_config(skb, nlh, &cfg);
2348         if (err < 0)
2349                 return err;
2350
2351         return ip6_route_del(&cfg);
2352 }
2353
2354 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2355 {
2356         struct fib6_config cfg;
2357         int err;
2358
2359         err = rtm_to_fib6_config(skb, nlh, &cfg);
2360         if (err < 0)
2361                 return err;
2362
2363         return ip6_route_add(&cfg);
2364 }
2365
2366 static inline size_t rt6_nlmsg_size(void)
2367 {
2368         return NLMSG_ALIGN(sizeof(struct rtmsg))
2369                + nla_total_size(16) /* RTA_SRC */
2370                + nla_total_size(16) /* RTA_DST */
2371                + nla_total_size(16) /* RTA_GATEWAY */
2372                + nla_total_size(16) /* RTA_PREFSRC */
2373                + nla_total_size(4) /* RTA_TABLE */
2374                + nla_total_size(4) /* RTA_IIF */
2375                + nla_total_size(4) /* RTA_OIF */
2376                + nla_total_size(4) /* RTA_PRIORITY */
2377                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2378                + nla_total_size(sizeof(struct rta_cacheinfo));
2379 }
2380
2381 static int rt6_fill_node(struct net *net,
2382                          struct sk_buff *skb, struct rt6_info *rt,
2383                          struct in6_addr *dst, struct in6_addr *src,
2384                          int iif, int type, u32 pid, u32 seq,
2385                          int prefix, int nowait, unsigned int flags)
2386 {
2387         const struct inet_peer *peer;
2388         struct rtmsg *rtm;
2389         struct nlmsghdr *nlh;
2390         long expires;
2391         u32 table;
2392         struct neighbour *n;
2393         u32 ts, tsage;
2394
2395         if (prefix) {   /* user wants prefix routes only */
2396                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2397                         /* success since this is not a prefix route */
2398                         return 1;
2399                 }
2400         }
2401
2402         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2403         if (!nlh)
2404                 return -EMSGSIZE;
2405
2406         rtm = nlmsg_data(nlh);
2407         rtm->rtm_family = AF_INET6;
2408         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2409         rtm->rtm_src_len = rt->rt6i_src.plen;
2410         rtm->rtm_tos = 0;
2411         if (rt->rt6i_table)
2412                 table = rt->rt6i_table->tb6_id;
2413         else
2414                 table = RT6_TABLE_UNSPEC;
2415         rtm->rtm_table = table;
2416         if (nla_put_u32(skb, RTA_TABLE, table))
2417                 goto nla_put_failure;
2418         if (rt->rt6i_flags & RTF_REJECT)
2419                 rtm->rtm_type = RTN_UNREACHABLE;
2420         else if (rt->rt6i_flags & RTF_LOCAL)
2421                 rtm->rtm_type = RTN_LOCAL;
2422         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2423                 rtm->rtm_type = RTN_LOCAL;
2424         else
2425                 rtm->rtm_type = RTN_UNICAST;
2426         rtm->rtm_flags = 0;
2427         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2428         rtm->rtm_protocol = rt->rt6i_protocol;
2429         if (rt->rt6i_flags & RTF_DYNAMIC)
2430                 rtm->rtm_protocol = RTPROT_REDIRECT;
2431         else if (rt->rt6i_flags & RTF_ADDRCONF)
2432                 rtm->rtm_protocol = RTPROT_KERNEL;
2433         else if (rt->rt6i_flags & RTF_DEFAULT)
2434                 rtm->rtm_protocol = RTPROT_RA;
2435
2436         if (rt->rt6i_flags & RTF_CACHE)
2437                 rtm->rtm_flags |= RTM_F_CLONED;
2438
2439         if (dst) {
2440                 if (nla_put(skb, RTA_DST, 16, dst))
2441                         goto nla_put_failure;
2442                 rtm->rtm_dst_len = 128;
2443         } else if (rtm->rtm_dst_len)
2444                 if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2445                         goto nla_put_failure;
2446 #ifdef CONFIG_IPV6_SUBTREES
2447         if (src) {
2448                 if (nla_put(skb, RTA_SRC, 16, src))
2449                         goto nla_put_failure;
2450                 rtm->rtm_src_len = 128;
2451         } else if (rtm->rtm_src_len &&
2452                    nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2453                 goto nla_put_failure;
2454 #endif
2455         if (iif) {
2456 #ifdef CONFIG_IPV6_MROUTE
2457                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2458                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2459                         if (err <= 0) {
2460                                 if (!nowait) {
2461                                         if (err == 0)
2462                                                 return 0;
2463                                         goto nla_put_failure;
2464                                 } else {
2465                                         if (err == -EMSGSIZE)
2466                                                 goto nla_put_failure;
2467                                 }
2468                         }
2469                 } else
2470 #endif
2471                         if (nla_put_u32(skb, RTA_IIF, iif))
2472                                 goto nla_put_failure;
2473         } else if (dst) {
2474                 struct in6_addr saddr_buf;
2475                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2476                     nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2477                         goto nla_put_failure;
2478         }
2479
2480         if (rt->rt6i_prefsrc.plen) {
2481                 struct in6_addr saddr_buf;
2482                 saddr_buf = rt->rt6i_prefsrc.addr;
2483                 if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2484                         goto nla_put_failure;
2485         }
2486
2487         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2488                 goto nla_put_failure;
2489
2490         rcu_read_lock();
2491         n = dst_get_neighbour_noref(&rt->dst);
2492         if (n) {
2493                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2494                         rcu_read_unlock();
2495                         goto nla_put_failure;
2496                 }
2497         }
2498         rcu_read_unlock();
2499
2500         if (rt->dst.dev &&
2501             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2502                 goto nla_put_failure;
2503         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2504                 goto nla_put_failure;
2505         if (!(rt->rt6i_flags & RTF_EXPIRES))
2506                 expires = 0;
2507         else if (rt->dst.expires - jiffies < INT_MAX)
2508                 expires = rt->dst.expires - jiffies;
2509         else
2510                 expires = INT_MAX;
2511
2512         peer = rt->rt6i_peer;
2513         ts = tsage = 0;
2514         if (peer && peer->tcp_ts_stamp) {
2515                 ts = peer->tcp_ts;
2516                 tsage = get_seconds() - peer->tcp_ts_stamp;
2517         }
2518
2519         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2520                                expires, rt->dst.error) < 0)
2521                 goto nla_put_failure;
2522
2523         return nlmsg_end(skb, nlh);
2524
2525 nla_put_failure:
2526         nlmsg_cancel(skb, nlh);
2527         return -EMSGSIZE;
2528 }
2529
2530 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2531 {
2532         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2533         int prefix;
2534
2535         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2536                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2537                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2538         } else
2539                 prefix = 0;
2540
2541         return rt6_fill_node(arg->net,
2542                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2543                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2544                      prefix, 0, NLM_F_MULTI);
2545 }
2546
2547 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2548 {
2549         struct net *net = sock_net(in_skb->sk);
2550         struct nlattr *tb[RTA_MAX+1];
2551         struct rt6_info *rt;
2552         struct sk_buff *skb;
2553         struct rtmsg *rtm;
2554         struct flowi6 fl6;
2555         int err, iif = 0, oif = 0;
2556
2557         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2558         if (err < 0)
2559                 goto errout;
2560
2561         err = -EINVAL;
2562         memset(&fl6, 0, sizeof(fl6));
2563
2564         if (tb[RTA_SRC]) {
2565                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2566                         goto errout;
2567
2568                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2569         }
2570
2571         if (tb[RTA_DST]) {
2572                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2573                         goto errout;
2574
2575                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2576         }
2577
2578         if (tb[RTA_IIF])
2579                 iif = nla_get_u32(tb[RTA_IIF]);
2580
2581         if (tb[RTA_OIF])
2582                 oif = nla_get_u32(tb[RTA_OIF]);
2583
2584         if (iif) {
2585                 struct net_device *dev;
2586                 int flags = 0;
2587
2588                 dev = __dev_get_by_index(net, iif);
2589                 if (!dev) {
2590                         err = -ENODEV;
2591                         goto errout;
2592                 }
2593
2594                 fl6.flowi6_iif = iif;
2595
2596                 if (!ipv6_addr_any(&fl6.saddr))
2597                         flags |= RT6_LOOKUP_F_HAS_SADDR;
2598
2599                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2600                                                                flags);
2601         } else {
2602                 fl6.flowi6_oif = oif;
2603
2604                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2605         }
2606
2607         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2608         if (!skb) {
2609                 dst_release(&rt->dst);
2610                 err = -ENOBUFS;
2611                 goto errout;
2612         }
2613
2614         /* Reserve room for dummy headers, this skb can pass
2615            through good chunk of routing engine.
2616          */
2617         skb_reset_mac_header(skb);
2618         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2619
2620         skb_dst_set(skb, &rt->dst);
2621
2622         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2623                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2624                             nlh->nlmsg_seq, 0, 0, 0);
2625         if (err < 0) {
2626                 kfree_skb(skb);
2627                 goto errout;
2628         }
2629
2630         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2631 errout:
2632         return err;
2633 }
2634
2635 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2636 {
2637         struct sk_buff *skb;
2638         struct net *net = info->nl_net;
2639         u32 seq;
2640         int err;
2641
2642         err = -ENOBUFS;
2643         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2644
2645         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2646         if (!skb)
2647                 goto errout;
2648
2649         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2650                                 event, info->pid, seq, 0, 0, 0);
2651         if (err < 0) {
2652                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2653                 WARN_ON(err == -EMSGSIZE);
2654                 kfree_skb(skb);
2655                 goto errout;
2656         }
2657         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2658                     info->nlh, gfp_any());
2659         return;
2660 errout:
2661         if (err < 0)
2662                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2663 }
2664
2665 static int ip6_route_dev_notify(struct notifier_block *this,
2666                                 unsigned long event, void *data)
2667 {
2668         struct net_device *dev = (struct net_device *)data;
2669         struct net *net = dev_net(dev);
2670
2671         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2672                 net->ipv6.ip6_null_entry->dst.dev = dev;
2673                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2674 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2675                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2676                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2677                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2678                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2679 #endif
2680         }
2681
2682         return NOTIFY_OK;
2683 }
2684
2685 /*
2686  *      /proc
2687  */
2688
2689 #ifdef CONFIG_PROC_FS
2690
/*
 * Buffer bookkeeping for /proc output.
 * NOTE(review): no user of this structure is visible in this part of the
 * file; the field meanings below are not documented here - confirm before
 * relying on them.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2699
2700 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2701 {
2702         struct seq_file *m = p_arg;
2703         struct neighbour *n;
2704
2705         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2706
2707 #ifdef CONFIG_IPV6_SUBTREES
2708         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2709 #else
2710         seq_puts(m, "00000000000000000000000000000000 00 ");
2711 #endif
2712         rcu_read_lock();
2713         n = dst_get_neighbour_noref(&rt->dst);
2714         if (n) {
2715                 seq_printf(m, "%pi6", n->primary_key);
2716         } else {
2717                 seq_puts(m, "00000000000000000000000000000000");
2718         }
2719         rcu_read_unlock();
2720         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2721                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2722                    rt->dst.__use, rt->rt6i_flags,
2723                    rt->dst.dev ? rt->dst.dev->name : "");
2724         return 0;
2725 }
2726
2727 static int ipv6_route_show(struct seq_file *m, void *v)
2728 {
2729         struct net *net = (struct net *)m->private;
2730         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2731         return 0;
2732 }
2733
2734 static int ipv6_route_open(struct inode *inode, struct file *file)
2735 {
2736         return single_open_net(inode, file, ipv6_route_show);
2737 }
2738
2739 static const struct file_operations ipv6_route_proc_fops = {
2740         .owner          = THIS_MODULE,
2741         .open           = ipv6_route_open,
2742         .read           = seq_read,
2743         .llseek         = seq_lseek,
2744         .release        = single_release_net,
2745 };
2746
2747 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2748 {
2749         struct net *net = (struct net *)seq->private;
2750         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2751                    net->ipv6.rt6_stats->fib_nodes,
2752                    net->ipv6.rt6_stats->fib_route_nodes,
2753                    net->ipv6.rt6_stats->fib_rt_alloc,
2754                    net->ipv6.rt6_stats->fib_rt_entries,
2755                    net->ipv6.rt6_stats->fib_rt_cache,
2756                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2757                    net->ipv6.rt6_stats->fib_discarded_routes);
2758
2759         return 0;
2760 }
2761
2762 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2763 {
2764         return single_open_net(inode, file, rt6_stats_seq_show);
2765 }
2766
2767 static const struct file_operations rt6_stats_seq_fops = {
2768         .owner   = THIS_MODULE,
2769         .open    = rt6_stats_seq_open,
2770         .read    = seq_read,
2771         .llseek  = seq_lseek,
2772         .release = single_release_net,
2773 };
2774 #endif  /* CONFIG_PROC_FS */
2775
2776 #ifdef CONFIG_SYSCTL
2777
2778 static
2779 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2780                               void __user *buffer, size_t *lenp, loff_t *ppos)
2781 {
2782         struct net *net;
2783         int delay;
2784         if (!write)
2785                 return -EINVAL;
2786
2787         net = (struct net *)ctl->extra1;
2788         delay = net->ipv6.sysctl.flush_delay;
2789         proc_dointvec(ctl, write, buffer, lenp, ppos);
2790         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2791         return 0;
2792 }
2793
2794 ctl_table ipv6_route_table_template[] = {
2795         {
2796                 .procname       =       "flush",
2797                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2798                 .maxlen         =       sizeof(int),
2799                 .mode           =       0200,
2800                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2801         },
2802         {
2803                 .procname       =       "gc_thresh",
2804                 .data           =       &ip6_dst_ops_template.gc_thresh,
2805                 .maxlen         =       sizeof(int),
2806                 .mode           =       0644,
2807                 .proc_handler   =       proc_dointvec,
2808         },
2809         {
2810                 .procname       =       "max_size",
2811                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2812                 .maxlen         =       sizeof(int),
2813                 .mode           =       0644,
2814                 .proc_handler   =       proc_dointvec,
2815         },
2816         {
2817                 .procname       =       "gc_min_interval",
2818                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2819                 .maxlen         =       sizeof(int),
2820                 .mode           =       0644,
2821                 .proc_handler   =       proc_dointvec_jiffies,
2822         },
2823         {
2824                 .procname       =       "gc_timeout",
2825                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2826                 .maxlen         =       sizeof(int),
2827                 .mode           =       0644,
2828                 .proc_handler   =       proc_dointvec_jiffies,
2829         },
2830         {
2831                 .procname       =       "gc_interval",
2832                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2833                 .maxlen         =       sizeof(int),
2834                 .mode           =       0644,
2835                 .proc_handler   =       proc_dointvec_jiffies,
2836         },
2837         {
2838                 .procname       =       "gc_elasticity",
2839                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2840                 .maxlen         =       sizeof(int),
2841                 .mode           =       0644,
2842                 .proc_handler   =       proc_dointvec,
2843         },
2844         {
2845                 .procname       =       "mtu_expires",
2846                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2847                 .maxlen         =       sizeof(int),
2848                 .mode           =       0644,
2849                 .proc_handler   =       proc_dointvec_jiffies,
2850         },
2851         {
2852                 .procname       =       "min_adv_mss",
2853                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2854                 .maxlen         =       sizeof(int),
2855                 .mode           =       0644,
2856                 .proc_handler   =       proc_dointvec,
2857         },
2858         {
2859                 .procname       =       "gc_min_interval_ms",
2860                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2861                 .maxlen         =       sizeof(int),
2862                 .mode           =       0644,
2863                 .proc_handler   =       proc_dointvec_ms_jiffies,
2864         },
2865         { }
2866 };
2867
2868 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2869 {
2870         struct ctl_table *table;
2871
2872         table = kmemdup(ipv6_route_table_template,
2873                         sizeof(ipv6_route_table_template),
2874                         GFP_KERNEL);
2875
2876         if (table) {
2877                 table[0].data = &net->ipv6.sysctl.flush_delay;
2878                 table[0].extra1 = net;
2879                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2880                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2881                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2882                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2883                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2884                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2885                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2886                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2887                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2888         }
2889
2890         return table;
2891 }
2892 #endif
2893
2894 static int __net_init ip6_route_net_init(struct net *net)
2895 {
2896         int ret = -ENOMEM;
2897
2898         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2899                sizeof(net->ipv6.ip6_dst_ops));
2900
2901         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2902                 goto out_ip6_dst_ops;
2903
2904         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2905                                            sizeof(*net->ipv6.ip6_null_entry),
2906                                            GFP_KERNEL);
2907         if (!net->ipv6.ip6_null_entry)
2908                 goto out_ip6_dst_entries;
2909         net->ipv6.ip6_null_entry->dst.path =
2910                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2911         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2912         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2913                          ip6_template_metrics, true);
2914
2915 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2916         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2917                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2918                                                GFP_KERNEL);
2919         if (!net->ipv6.ip6_prohibit_entry)
2920                 goto out_ip6_null_entry;
2921         net->ipv6.ip6_prohibit_entry->dst.path =
2922                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2923         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2924         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2925                          ip6_template_metrics, true);
2926
2927         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2928                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2929                                                GFP_KERNEL);
2930         if (!net->ipv6.ip6_blk_hole_entry)
2931                 goto out_ip6_prohibit_entry;
2932         net->ipv6.ip6_blk_hole_entry->dst.path =
2933                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2934         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2935         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2936                          ip6_template_metrics, true);
2937 #endif
2938
2939         net->ipv6.sysctl.flush_delay = 0;
2940         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2941         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2942         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2943         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2944         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2945         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2946         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2947
2948 #ifdef CONFIG_PROC_FS
2949         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2950         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2951 #endif
2952         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2953
2954         ret = 0;
2955 out:
2956         return ret;
2957
2958 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2959 out_ip6_prohibit_entry:
2960         kfree(net->ipv6.ip6_prohibit_entry);
2961 out_ip6_null_entry:
2962         kfree(net->ipv6.ip6_null_entry);
2963 #endif
2964 out_ip6_dst_entries:
2965         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2966 out_ip6_dst_ops:
2967         goto out;
2968 }
2969
2970 static void __net_exit ip6_route_net_exit(struct net *net)
2971 {
2972 #ifdef CONFIG_PROC_FS
2973         proc_net_remove(net, "ipv6_route");
2974         proc_net_remove(net, "rt6_stats");
2975 #endif
2976         kfree(net->ipv6.ip6_null_entry);
2977 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2978         kfree(net->ipv6.ip6_prohibit_entry);
2979         kfree(net->ipv6.ip6_blk_hole_entry);
2980 #endif
2981         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2982 }
2983
2984 static struct pernet_operations ip6_route_net_ops = {
2985         .init = ip6_route_net_init,
2986         .exit = ip6_route_net_exit,
2987 };
2988
2989 static struct notifier_block ip6_route_dev_notifier = {
2990         .notifier_call = ip6_route_dev_notify,
2991         .priority = 0,
2992 };
2993
2994 int __init ip6_route_init(void)
2995 {
2996         int ret;
2997
2998         ret = -ENOMEM;
2999         ip6_dst_ops_template.kmem_cachep =
3000                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3001                                   SLAB_HWCACHE_ALIGN, NULL);
3002         if (!ip6_dst_ops_template.kmem_cachep)
3003                 goto out;
3004
3005         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3006         if (ret)
3007                 goto out_kmem_cache;
3008
3009         ret = register_pernet_subsys(&ip6_route_net_ops);
3010         if (ret)
3011                 goto out_dst_entries;
3012
3013         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3014
3015         /* Registering of the loopback is done before this portion of code,
3016          * the loopback reference in rt6_info will not be taken, do it
3017          * manually for init_net */
3018         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3019         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3020   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3021         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3022         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3023         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3024         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3025   #endif
3026         ret = fib6_init();
3027         if (ret)
3028                 goto out_register_subsys;
3029
3030         ret = xfrm6_init();
3031         if (ret)
3032                 goto out_fib6_init;
3033
3034         ret = fib6_rules_init();
3035         if (ret)
3036                 goto xfrm6_init;
3037
3038         ret = -ENOBUFS;
3039         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3040             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3041             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3042                 goto fib6_rules_init;
3043
3044         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3045         if (ret)
3046                 goto fib6_rules_init;
3047
3048 out:
3049         return ret;
3050
3051 fib6_rules_init:
3052         fib6_rules_cleanup();
3053 xfrm6_init:
3054         xfrm6_fini();
3055 out_fib6_init:
3056         fib6_gc_cleanup();
3057 out_register_subsys:
3058         unregister_pernet_subsys(&ip6_route_net_ops);
3059 out_dst_entries:
3060         dst_entries_destroy(&ip6_dst_blackhole_ops);
3061 out_kmem_cache:
3062         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3063         goto out;
3064 }
3065
3066 void ip6_route_cleanup(void)
3067 {
3068         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3069         fib6_rules_cleanup();
3070         xfrm6_fini();
3071         fib6_gc_cleanup();
3072         unregister_pernet_subsys(&ip6_route_net_ops);
3073         dst_entries_destroy(&ip6_dst_blackhole_ops);
3074         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3075 }