net: Add optional SKB arg to dst_ops->neigh_lookup().
[firefly-linux-kernel-4.4.55.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60
61 #include <asm/uaccess.h>
62
63 #ifdef CONFIG_SYSCTL
64 #include <linux/sysctl.h>
65 #endif
66
67 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
68                                     const struct in6_addr *dest);
69 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
70 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
71 static unsigned int      ip6_mtu(const struct dst_entry *dst);
72 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
73 static void             ip6_dst_destroy(struct dst_entry *);
74 static void             ip6_dst_ifdown(struct dst_entry *,
75                                        struct net_device *dev, int how);
76 static int               ip6_dst_gc(struct dst_ops *ops);
77
78 static int              ip6_pkt_discard(struct sk_buff *skb);
79 static int              ip6_pkt_discard_out(struct sk_buff *skb);
80 static void             ip6_link_failure(struct sk_buff *skb);
81 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
82
83 #ifdef CONFIG_IPV6_ROUTE_INFO
84 static struct rt6_info *rt6_add_route_info(struct net *net,
85                                            const struct in6_addr *prefix, int prefixlen,
86                                            const struct in6_addr *gwaddr, int ifindex,
87                                            unsigned int pref);
88 static struct rt6_info *rt6_get_route_info(struct net *net,
89                                            const struct in6_addr *prefix, int prefixlen,
90                                            const struct in6_addr *gwaddr, int ifindex);
91 #endif
92
93 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
94 {
95         struct rt6_info *rt = (struct rt6_info *) dst;
96         struct inet_peer *peer;
97         u32 *p = NULL;
98
99         if (!(rt->dst.flags & DST_HOST))
100                 return NULL;
101
102         peer = rt6_get_peer_create(rt);
103         if (peer) {
104                 u32 *old_p = __DST_METRICS_PTR(old);
105                 unsigned long prev, new;
106
107                 p = peer->metrics;
108                 if (inet_metrics_new(peer))
109                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
110
111                 new = (unsigned long) p;
112                 prev = cmpxchg(&dst->_metrics, old, new);
113
114                 if (prev != old) {
115                         p = __DST_METRICS_PTR(prev);
116                         if (prev & DST_METRICS_READ_ONLY)
117                                 p = NULL;
118                 }
119         }
120         return p;
121 }
122
123 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
124                                              struct sk_buff *skb,
125                                              const void *daddr)
126 {
127         struct in6_addr *p = &rt->rt6i_gateway;
128
129         if (!ipv6_addr_any(p))
130                 return (const void *) p;
131         else if (skb)
132                 return &ipv6_hdr(skb)->daddr;
133         return daddr;
134 }
135
136 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
137                                           struct sk_buff *skb,
138                                           const void *daddr)
139 {
140         struct rt6_info *rt = (struct rt6_info *) dst;
141         struct neighbour *n;
142
143         daddr = choose_neigh_daddr(rt, skb, daddr);
144         n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
145         if (n)
146                 return n;
147         return neigh_create(&nd_tbl, daddr, dst->dev);
148 }
149
150 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
151 {
152         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
153         if (!n) {
154                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
155                 if (IS_ERR(n))
156                         return PTR_ERR(n);
157         }
158         dst_set_neighbour(&rt->dst, n);
159
160         return 0;
161 }
162
163 static struct dst_ops ip6_dst_ops_template = {
164         .family                 =       AF_INET6,
165         .protocol               =       cpu_to_be16(ETH_P_IPV6),
166         .gc                     =       ip6_dst_gc,
167         .gc_thresh              =       1024,
168         .check                  =       ip6_dst_check,
169         .default_advmss         =       ip6_default_advmss,
170         .mtu                    =       ip6_mtu,
171         .cow_metrics            =       ipv6_cow_metrics,
172         .destroy                =       ip6_dst_destroy,
173         .ifdown                 =       ip6_dst_ifdown,
174         .negative_advice        =       ip6_negative_advice,
175         .link_failure           =       ip6_link_failure,
176         .update_pmtu            =       ip6_rt_update_pmtu,
177         .local_out              =       __ip6_local_out,
178         .neigh_lookup           =       ip6_neigh_lookup,
179 };
180
181 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
182 {
183         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
184
185         return mtu ? : dst->dev->mtu;
186 }
187
188 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
189 {
190 }
191
192 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
193                                          unsigned long old)
194 {
195         return NULL;
196 }
197
198 static struct dst_ops ip6_dst_blackhole_ops = {
199         .family                 =       AF_INET6,
200         .protocol               =       cpu_to_be16(ETH_P_IPV6),
201         .destroy                =       ip6_dst_destroy,
202         .check                  =       ip6_dst_check,
203         .mtu                    =       ip6_blackhole_mtu,
204         .default_advmss         =       ip6_default_advmss,
205         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
206         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
207         .neigh_lookup           =       ip6_neigh_lookup,
208 };
209
210 static const u32 ip6_template_metrics[RTAX_MAX] = {
211         [RTAX_HOPLIMIT - 1] = 255,
212 };
213
214 static struct rt6_info ip6_null_entry_template = {
215         .dst = {
216                 .__refcnt       = ATOMIC_INIT(1),
217                 .__use          = 1,
218                 .obsolete       = -1,
219                 .error          = -ENETUNREACH,
220                 .input          = ip6_pkt_discard,
221                 .output         = ip6_pkt_discard_out,
222         },
223         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
224         .rt6i_protocol  = RTPROT_KERNEL,
225         .rt6i_metric    = ~(u32) 0,
226         .rt6i_ref       = ATOMIC_INIT(1),
227 };
228
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route entry: a reject route whose dst
 * reports -EACCES and hands packets to the ip6_pkt_prohibit handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_ref	= ATOMIC_INIT(1),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32)0,
};

/*
 * Template for the "blackhole" route entry: a reject route whose dst
 * reports -EINVAL and passes packets to dst_discard in both directions.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard,
	},
	.rt6i_ref	= ATOMIC_INIT(1),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32)0,
};

#endif
265
266 /* allocate dst with ip6_dst_ops */
267 static inline struct rt6_info *ip6_dst_alloc(struct net *net,
268                                              struct net_device *dev,
269                                              int flags,
270                                              struct fib6_table *table)
271 {
272         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
273                                         0, 0, flags);
274
275         if (rt) {
276                 memset(&rt->rt6i_table, 0,
277                        sizeof(*rt) - sizeof(struct dst_entry));
278                 rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
279         }
280         return rt;
281 }
282
283 static void ip6_dst_destroy(struct dst_entry *dst)
284 {
285         struct rt6_info *rt = (struct rt6_info *)dst;
286         struct inet6_dev *idev = rt->rt6i_idev;
287
288         if (!(rt->dst.flags & DST_HOST))
289                 dst_destroy_metrics_generic(dst);
290
291         if (idev) {
292                 rt->rt6i_idev = NULL;
293                 in6_dev_put(idev);
294         }
295
296         if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
297                 dst_release(dst->from);
298
299         if (rt6_has_peer(rt)) {
300                 struct inet_peer *peer = rt6_peer_ptr(rt);
301                 inet_putpeer(peer);
302         }
303 }
304
305 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
306
307 static u32 rt6_peer_genid(void)
308 {
309         return atomic_read(&__rt6_peer_genid);
310 }
311
312 void rt6_bind_peer(struct rt6_info *rt, int create)
313 {
314         struct inet_peer_base *base;
315         struct inet_peer *peer;
316
317         base = inetpeer_base_ptr(rt->_rt6i_peer);
318         if (!base)
319                 return;
320
321         peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
322         if (peer) {
323                 if (!rt6_set_peer(rt, peer))
324                         inet_putpeer(peer);
325                 else
326                         rt->rt6i_peer_genid = rt6_peer_genid();
327         }
328 }
329
330 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
331                            int how)
332 {
333         struct rt6_info *rt = (struct rt6_info *)dst;
334         struct inet6_dev *idev = rt->rt6i_idev;
335         struct net_device *loopback_dev =
336                 dev_net(dev)->loopback_dev;
337
338         if (dev != loopback_dev && idev && idev->dev == dev) {
339                 struct inet6_dev *loopback_idev =
340                         in6_dev_get(loopback_dev);
341                 if (loopback_idev) {
342                         rt->rt6i_idev = loopback_idev;
343                         in6_dev_put(idev);
344                 }
345         }
346 }
347
348 static bool rt6_check_expired(const struct rt6_info *rt)
349 {
350         struct rt6_info *ort = NULL;
351
352         if (rt->rt6i_flags & RTF_EXPIRES) {
353                 if (time_after(jiffies, rt->dst.expires))
354                         return true;
355         } else if (rt->dst.from) {
356                 ort = (struct rt6_info *) rt->dst.from;
357                 return (ort->rt6i_flags & RTF_EXPIRES) &&
358                         time_after(jiffies, ort->dst.expires);
359         }
360         return false;
361 }
362
363 static bool rt6_need_strict(const struct in6_addr *daddr)
364 {
365         return ipv6_addr_type(daddr) &
366                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
367 }
368
/*
 *	Route lookup. The callers in this file hold table->tb6_lock
 *	while using these helpers.
 */
372
373 static inline struct rt6_info *rt6_device_match(struct net *net,
374                                                     struct rt6_info *rt,
375                                                     const struct in6_addr *saddr,
376                                                     int oif,
377                                                     int flags)
378 {
379         struct rt6_info *local = NULL;
380         struct rt6_info *sprt;
381
382         if (!oif && ipv6_addr_any(saddr))
383                 goto out;
384
385         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
386                 struct net_device *dev = sprt->dst.dev;
387
388                 if (oif) {
389                         if (dev->ifindex == oif)
390                                 return sprt;
391                         if (dev->flags & IFF_LOOPBACK) {
392                                 if (!sprt->rt6i_idev ||
393                                     sprt->rt6i_idev->dev->ifindex != oif) {
394                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
395                                                 continue;
396                                         if (local && (!oif ||
397                                                       local->rt6i_idev->dev->ifindex == oif))
398                                                 continue;
399                                 }
400                                 local = sprt;
401                         }
402                 } else {
403                         if (ipv6_chk_addr(net, saddr, dev,
404                                           flags & RT6_LOOKUP_F_IFACE))
405                                 return sprt;
406                 }
407         }
408
409         if (oif) {
410                 if (local)
411                         return local;
412
413                 if (flags & RT6_LOOKUP_F_IFACE)
414                         return net->ipv6.ip6_null_entry;
415         }
416 out:
417         return rt;
418 }
419
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a neighbour solicitation for the
 * neighbour bound to @rt when that neighbour is not in a NUD_VALID state
 * and the last update is older than the interface's rtr_probe_interval.
 *
 * @rt may be NULL, in which case nothing happens.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute; the rate limit is implemented by stamping neigh->updated before
 * the solicitation is sent.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
	/* Cheap unlocked check first; it is repeated under the lock below. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	/*
	 * neigh->updated is modified in the critical section, so the lock
	 * has to be held for writing. A read lock would let several CPUs
	 * pass the time check together and each send a probe.
	 */
	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		/* Release the lock before transmitting. */
		write_unlock_bh(&neigh->lock);

		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
	} else {
		write_unlock_bh(&neigh->lock);
	}
out:
	rcu_read_unlock();
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
459
460 /*
461  * Default Router Selection (RFC 2461 6.3.6)
462  */
463 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
464 {
465         struct net_device *dev = rt->dst.dev;
466         if (!oif || dev->ifindex == oif)
467                 return 2;
468         if ((dev->flags & IFF_LOOPBACK) &&
469             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
470                 return 1;
471         return 0;
472 }
473
474 static inline int rt6_check_neigh(struct rt6_info *rt)
475 {
476         struct neighbour *neigh;
477         int m;
478
479         rcu_read_lock();
480         neigh = dst_get_neighbour_noref(&rt->dst);
481         if (rt->rt6i_flags & RTF_NONEXTHOP ||
482             !(rt->rt6i_flags & RTF_GATEWAY))
483                 m = 1;
484         else if (neigh) {
485                 read_lock_bh(&neigh->lock);
486                 if (neigh->nud_state & NUD_VALID)
487                         m = 2;
488 #ifdef CONFIG_IPV6_ROUTER_PREF
489                 else if (neigh->nud_state & NUD_FAILED)
490                         m = 0;
491 #endif
492                 else
493                         m = 1;
494                 read_unlock_bh(&neigh->lock);
495         } else
496                 m = 0;
497         rcu_read_unlock();
498         return m;
499 }
500
501 static int rt6_score_route(struct rt6_info *rt, int oif,
502                            int strict)
503 {
504         int m, n;
505
506         m = rt6_check_dev(rt, oif);
507         if (!m && (strict & RT6_LOOKUP_F_IFACE))
508                 return -1;
509 #ifdef CONFIG_IPV6_ROUTER_PREF
510         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
511 #endif
512         n = rt6_check_neigh(rt);
513         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
514                 return -1;
515         return m;
516 }
517
518 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
519                                    int *mpri, struct rt6_info *match)
520 {
521         int m;
522
523         if (rt6_check_expired(rt))
524                 goto out;
525
526         m = rt6_score_route(rt, oif, strict);
527         if (m < 0)
528                 goto out;
529
530         if (m > *mpri) {
531                 if (strict & RT6_LOOKUP_F_REACHABLE)
532                         rt6_probe(match);
533                 *mpri = m;
534                 match = rt;
535         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
536                 rt6_probe(rt);
537         }
538
539 out:
540         return match;
541 }
542
543 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
544                                      struct rt6_info *rr_head,
545                                      u32 metric, int oif, int strict)
546 {
547         struct rt6_info *rt, *match;
548         int mpri = -1;
549
550         match = NULL;
551         for (rt = rr_head; rt && rt->rt6i_metric == metric;
552              rt = rt->dst.rt6_next)
553                 match = find_match(rt, oif, strict, &mpri, match);
554         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
555              rt = rt->dst.rt6_next)
556                 match = find_match(rt, oif, strict, &mpri, match);
557
558         return match;
559 }
560
561 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
562 {
563         struct rt6_info *match, *rt0;
564         struct net *net;
565
566         rt0 = fn->rr_ptr;
567         if (!rt0)
568                 fn->rr_ptr = rt0 = fn->leaf;
569
570         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
571
572         if (!match &&
573             (strict & RT6_LOOKUP_F_REACHABLE)) {
574                 struct rt6_info *next = rt0->dst.rt6_next;
575
576                 /* no entries matched; do round-robin */
577                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
578                         next = fn->leaf;
579
580                 if (next != rt0)
581                         fn->rr_ptr = next;
582         }
583
584         net = dev_net(rt0->dst.dev);
585         return match ? match : net->ipv6.ip6_null_entry;
586 }
587
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a route_info option received on @dev.
 *
 * @dev:    receiving device
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address for the route
 *
 * Depending on the advertised lifetime, the matching route is deleted,
 * added, or has its preference bits and expiry refreshed.
 *
 * Returns -EINVAL when the option fails validation, 0 otherwise.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *)opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned long lifetime;
	unsigned int pref;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/*
	 * Sanity check for prefix_len and length.
	 * NOTE(review): these bounds look looser than the Length / Prefix
	 * Length pairing described in RFC 4191 section 2.3 - confirm whether
	 * that is intentional.
	 */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* use the prefix bytes in place */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* shorter option: build the prefix in a local buffer */
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		/* Existing route: refresh the preference bits. */
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime))
		rt6_set_expires(rt, jiffies + HZ * lifetime);
	else
		rt6_clean_expires(rt);

	dst_release(&rt->dst);
	return 0;
}
#endif
660
/*
 * BACKTRACK - move up the FIB tree when the current node yielded the
 * null entry.
 *
 * This macro is not self-contained. It reads the local variable "rt",
 * reads and updates the local variable "fn", and jumps to the labels
 * "out" and "restart", all of which must exist in the calling function.
 *
 * When rt is the null entry of @__net, the walk climbs from fn towards
 * the root. If the parent has a subtree other than fn, that subtree is
 * searched with @saddr. The walk jumps to "restart" at the first node
 * carrying RTN_RTINFO and to "out" once a node flagged RTN_TL_ROOT is
 * reached. When rt is anything else the macro does nothing.
 *
 * Both arguments are parenthesised in the expansion so that any pointer
 * expression can be passed safely.
 */
#define BACKTRACK(__net, saddr) \
do { \
	if (rt == (__net)->ipv6.ip6_null_entry) { \
		struct fib6_node *pn; \
		for (;;) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
678
679 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
680                                              struct fib6_table *table,
681                                              struct flowi6 *fl6, int flags)
682 {
683         struct fib6_node *fn;
684         struct rt6_info *rt;
685
686         read_lock_bh(&table->tb6_lock);
687         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
688 restart:
689         rt = fn->leaf;
690         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
691         BACKTRACK(net, &fl6->saddr);
692 out:
693         dst_use(&rt->dst, jiffies);
694         read_unlock_bh(&table->tb6_lock);
695         return rt;
696
697 }
698
699 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
700                                     int flags)
701 {
702         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
703 }
704 EXPORT_SYMBOL_GPL(ip6_route_lookup);
705
706 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
707                             const struct in6_addr *saddr, int oif, int strict)
708 {
709         struct flowi6 fl6 = {
710                 .flowi6_oif = oif,
711                 .daddr = *daddr,
712         };
713         struct dst_entry *dst;
714         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
715
716         if (saddr) {
717                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
718                 flags |= RT6_LOOKUP_F_HAS_SADDR;
719         }
720
721         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
722         if (dst->error == 0)
723                 return (struct rt6_info *) dst;
724
725         dst_release(dst);
726
727         return NULL;
728 }
729
730 EXPORT_SYMBOL(rt6_lookup);
731
732 /* ip6_ins_rt is called with FREE table->tb6_lock.
733    It takes new route entry, the addition fails by any reason the
734    route is freed. In any case, if caller does not hold it, it may
735    be destroyed.
736  */
737
738 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
739 {
740         int err;
741         struct fib6_table *table;
742
743         table = rt->rt6i_table;
744         write_lock_bh(&table->tb6_lock);
745         err = fib6_add(&table->tb6_root, rt, info);
746         write_unlock_bh(&table->tb6_lock);
747
748         return err;
749 }
750
751 int ip6_ins_rt(struct rt6_info *rt)
752 {
753         struct nl_info info = {
754                 .nl_net = dev_net(rt->dst.dev),
755         };
756         return __ip6_ins_rt(rt, &info);
757 }
758
759 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
760                                       const struct in6_addr *daddr,
761                                       const struct in6_addr *saddr)
762 {
763         struct rt6_info *rt;
764
765         /*
766          *      Clone the route.
767          */
768
769         rt = ip6_rt_copy(ort, daddr);
770
771         if (rt) {
772                 int attempts = !in_softirq();
773
774                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
775                         if (ort->rt6i_dst.plen != 128 &&
776                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
777                                 rt->rt6i_flags |= RTF_ANYCAST;
778                         rt->rt6i_gateway = *daddr;
779                 }
780
781                 rt->rt6i_flags |= RTF_CACHE;
782
783 #ifdef CONFIG_IPV6_SUBTREES
784                 if (rt->rt6i_src.plen && saddr) {
785                         rt->rt6i_src.addr = *saddr;
786                         rt->rt6i_src.plen = 128;
787                 }
788 #endif
789
790         retry:
791                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
792                         struct net *net = dev_net(rt->dst.dev);
793                         int saved_rt_min_interval =
794                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
795                         int saved_rt_elasticity =
796                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
797
798                         if (attempts-- > 0) {
799                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
800                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
801
802                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
803
804                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
805                                         saved_rt_elasticity;
806                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
807                                         saved_rt_min_interval;
808                                 goto retry;
809                         }
810
811                         net_warn_ratelimited("Neighbour table overflow\n");
812                         dst_free(&rt->dst);
813                         return NULL;
814                 }
815         }
816
817         return rt;
818 }
819
820 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
821                                         const struct in6_addr *daddr)
822 {
823         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
824
825         if (rt) {
826                 rt->rt6i_flags |= RTF_CACHE;
827                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
828         }
829         return rt;
830 }
831
/*
 * ip6_pol_route - resolve a flow to a route inside one FIB table.
 * @net:   network namespace; supplies the forwarding setting and ip6_null_entry
 * @table: FIB table to search; tb6_lock is held for reading during the lookup
 * @oif:   interface index handed to rt6_select()
 * @fl6:   flow; daddr/saddr drive the lookup and any per-destination copy
 * @flags: lookup flags; only RT6_LOOKUP_F_IFACE is consumed here
 *
 * Returns a route on which dst_hold() has been called and whose
 * dst.lastuse / dst.__use have been updated.  The result may be
 * net->ipv6.ip6_null_entry.
 *
 * If the selected route is neither ip6_null_entry nor an RTF_CACHE entry,
 * a per-destination copy is created (rt6_alloc_cow() or rt6_alloc_clone())
 * and inserted with ip6_ins_rt(), unless the route already has DST_HOST set
 * and needs no neighbour, in which case it is returned as is.
 *
 * NOTE(review): BACKTRACK() is defined outside this block; it presumably
 * walks up the tree and jumps to the "restart"/"out" labels - confirm.
 */
832 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
833                                       struct flowi6 *fl6, int flags)
834 {
835         struct fib6_node *fn;
836         struct rt6_info *rt, *nrt;
837         int strict = 0;
838         int attempts = 3;
839         int err;
            /* Ask for reachable routers only while forwarding is disabled. */
840         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
841
842         strict |= flags & RT6_LOOKUP_F_IFACE;
843
844 relookup:
845         read_lock_bh(&table->tb6_lock);
846
847 restart_2:
848         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
849
850 restart:
851         rt = rt6_select(fn, oif, strict | reachable);
852
853         BACKTRACK(net, &fl6->saddr);
854         if (rt == net->ipv6.ip6_null_entry ||
855             rt->rt6i_flags & RTF_CACHE)
856                 goto out;
857
            /* Pin the route before dropping the table lock. */
858         dst_hold(&rt->dst);
859         read_unlock_bh(&table->tb6_lock);
860
            /* Pick how (and whether) to make a per-destination copy. */
861         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
862                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
863         else if (!(rt->dst.flags & DST_HOST))
864                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
865         else
866                 goto out2;
867
            /* Swap our reference from the original to the copy (or null entry). */
868         dst_release(&rt->dst);
869         rt = nrt ? : net->ipv6.ip6_null_entry;
870
871         dst_hold(&rt->dst);
872         if (nrt) {
873                 err = ip6_ins_rt(nrt);
874                 if (!err)
875                         goto out2;
876         }
877
            /* Allocation or insertion failed: retry a bounded number of times. */
878         if (--attempts <= 0)
879                 goto out2;
880
881         /*
882          * Race condition! In the gap, when table->tb6_lock was
883          * released someone could insert this route.  Relookup.
884          */
885         dst_release(&rt->dst);
886         goto relookup;
887
888 out:
            /*
             * Reached with the table lock still held.  If the lookup was
             * restricted to reachable routers, drop that restriction and
             * look up once more before settling on the result.
             */
889         if (reachable) {
890                 reachable = 0;
891                 goto restart_2;
892         }
893         dst_hold(&rt->dst);
894         read_unlock_bh(&table->tb6_lock);
895 out2:
896         rt->dst.lastuse = jiffies;
897         rt->dst.__use++;
898
899         return rt;
900 }
901
902 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
903                                             struct flowi6 *fl6, int flags)
904 {
905         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
906 }
907
908 static struct dst_entry *ip6_route_input_lookup(struct net *net,
909                                                 struct net_device *dev,
910                                                 struct flowi6 *fl6, int flags)
911 {
912         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
913                 flags |= RT6_LOOKUP_F_IFACE;
914
915         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
916 }
917
918 void ip6_route_input(struct sk_buff *skb)
919 {
920         const struct ipv6hdr *iph = ipv6_hdr(skb);
921         struct net *net = dev_net(skb->dev);
922         int flags = RT6_LOOKUP_F_HAS_SADDR;
923         struct flowi6 fl6 = {
924                 .flowi6_iif = skb->dev->ifindex,
925                 .daddr = iph->daddr,
926                 .saddr = iph->saddr,
927                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
928                 .flowi6_mark = skb->mark,
929                 .flowi6_proto = iph->nexthdr,
930         };
931
932         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
933 }
934
935 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
936                                              struct flowi6 *fl6, int flags)
937 {
938         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
939 }
940
941 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
942                                     struct flowi6 *fl6)
943 {
944         int flags = 0;
945
946         fl6->flowi6_iif = net->loopback_dev->ifindex;
947
948         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
949                 flags |= RT6_LOOKUP_F_IFACE;
950
951         if (!ipv6_addr_any(&fl6->saddr))
952                 flags |= RT6_LOOKUP_F_HAS_SADDR;
953         else if (sk)
954                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
955
956         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
957 }
958
959 EXPORT_SYMBOL(ip6_route_output);
960
961 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
962 {
963         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
964         struct dst_entry *new = NULL;
965
966         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
967         if (rt) {
968                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
969                 rt6_init_peer(rt, net->ipv6.peers);
970
971                 new = &rt->dst;
972
973                 new->__use = 1;
974                 new->input = dst_discard;
975                 new->output = dst_discard;
976
977                 if (dst_metrics_read_only(&ort->dst))
978                         new->_metrics = ort->dst._metrics;
979                 else
980                         dst_copy_metrics(new, &ort->dst);
981                 rt->rt6i_idev = ort->rt6i_idev;
982                 if (rt->rt6i_idev)
983                         in6_dev_hold(rt->rt6i_idev);
984
985                 rt->rt6i_gateway = ort->rt6i_gateway;
986                 rt->rt6i_flags = ort->rt6i_flags;
987                 rt6_clean_expires(rt);
988                 rt->rt6i_metric = 0;
989
990                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
991 #ifdef CONFIG_IPV6_SUBTREES
992                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
993 #endif
994
995                 dst_free(new);
996         }
997
998         dst_release(dst_orig);
999         return new ? new : ERR_PTR(-ENOMEM);
1000 }
1001
1002 /*
1003  *      Destination cache support functions
1004  */
1005
1006 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1007 {
1008         struct rt6_info *rt;
1009
1010         rt = (struct rt6_info *) dst;
1011
1012         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
1013                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1014                         if (!rt6_has_peer(rt))
1015                                 rt6_bind_peer(rt, 0);
1016                         rt->rt6i_peer_genid = rt6_peer_genid();
1017                 }
1018                 return dst;
1019         }
1020         return NULL;
1021 }
1022
1023 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1024 {
1025         struct rt6_info *rt = (struct rt6_info *) dst;
1026
1027         if (rt) {
1028                 if (rt->rt6i_flags & RTF_CACHE) {
1029                         if (rt6_check_expired(rt)) {
1030                                 ip6_del_rt(rt);
1031                                 dst = NULL;
1032                         }
1033                 } else {
1034                         dst_release(dst);
1035                         dst = NULL;
1036                 }
1037         }
1038         return dst;
1039 }
1040
1041 static void ip6_link_failure(struct sk_buff *skb)
1042 {
1043         struct rt6_info *rt;
1044
1045         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1046
1047         rt = (struct rt6_info *) skb_dst(skb);
1048         if (rt) {
1049                 if (rt->rt6i_flags & RTF_CACHE)
1050                         rt6_update_expires(rt, 0);
1051                 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1052                         rt->rt6i_node->fn_sernum = -1;
1053         }
1054 }
1055
1056 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1057 {
1058         struct rt6_info *rt6 = (struct rt6_info*)dst;
1059
1060         dst_confirm(dst);
1061         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1062                 struct net *net = dev_net(dst->dev);
1063
1064                 rt6->rt6i_flags |= RTF_MODIFIED;
1065                 if (mtu < IPV6_MIN_MTU) {
1066                         u32 features = dst_metric(dst, RTAX_FEATURES);
1067                         mtu = IPV6_MIN_MTU;
1068                         features |= RTAX_FEATURE_ALLFRAG;
1069                         dst_metric_set(dst, RTAX_FEATURES, features);
1070                 }
1071                 dst_metric_set(dst, RTAX_MTU, mtu);
1072                 rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
1073         }
1074 }
1075
1076 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1077                      int oif, u32 mark)
1078 {
1079         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1080         struct dst_entry *dst;
1081         struct flowi6 fl6;
1082
1083         memset(&fl6, 0, sizeof(fl6));
1084         fl6.flowi6_oif = oif;
1085         fl6.flowi6_mark = mark;
1086         fl6.flowi6_flags = FLOWI_FLAG_PRECOW_METRICS;
1087         fl6.daddr = iph->daddr;
1088         fl6.saddr = iph->saddr;
1089         fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
1090
1091         dst = ip6_route_output(net, NULL, &fl6);
1092         if (!dst->error)
1093                 ip6_rt_update_pmtu(dst, ntohl(mtu));
1094         dst_release(dst);
1095 }
1096 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1097
1098 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1099 {
1100         ip6_update_pmtu(skb, sock_net(sk), mtu,
1101                         sk->sk_bound_dev_if, sk->sk_mark);
1102 }
1103 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1104
1105 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1106 {
1107         struct net_device *dev = dst->dev;
1108         unsigned int mtu = dst_mtu(dst);
1109         struct net *net = dev_net(dev);
1110
1111         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1112
1113         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1114                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1115
1116         /*
1117          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1118          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1119          * IPV6_MAXPLEN is also valid and means: "any MSS,
1120          * rely only on pmtu discovery"
1121          */
1122         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1123                 mtu = IPV6_MAXPLEN;
1124         return mtu;
1125 }
1126
1127 static unsigned int ip6_mtu(const struct dst_entry *dst)
1128 {
1129         struct inet6_dev *idev;
1130         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1131
1132         if (mtu)
1133                 return mtu;
1134
1135         mtu = IPV6_MIN_MTU;
1136
1137         rcu_read_lock();
1138         idev = __in6_dev_get(dst->dev);
1139         if (idev)
1140                 mtu = idev->cnf.mtu6;
1141         rcu_read_unlock();
1142
1143         return mtu;
1144 }
1145
/*
 * Head of the list (chained through dst->next) of dst entries created by
 * icmp6_dst_alloc().  Entries are unlinked and freed by icmp6_dst_gc() and
 * icmp6_clean_all().
 */
1146 static struct dst_entry *icmp6_dst_gc_list;
/* Taken (with BHs disabled) around every access to icmp6_dst_gc_list. */
1147 static DEFINE_SPINLOCK(icmp6_dst_lock);
1148
/*
 * icmp6_dst_alloc - create a standalone host dst on @dev for fl6->daddr.
 * @dev:   output device; an inet6_dev reference is taken on it
 * @neigh: neighbour to attach, or NULL to look one up for fl6->daddr
 * @fl6:   flow; daddr becomes the /128 destination and the flow is passed
 *         on to xfrm_lookup()
 *
 * The new dst is not inserted into a FIB table.  Instead it is linked into
 * icmp6_dst_gc_list and garbage collection is kicked via
 * fib6_force_start_gc().
 *
 * Returns the result of xfrm_lookup() on the new dst, or an ERR_PTR:
 * -ENODEV when @dev has no inet6_dev, -ENOMEM when allocation fails, or
 * the error from ip6_neigh_lookup().
 */
1149 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1150                                   struct neighbour *neigh,
1151                                   struct flowi6 *fl6)
1152 {
1153         struct dst_entry *dst;
1154         struct rt6_info *rt;
1155         struct inet6_dev *idev = in6_dev_get(dev);
1156         struct net *net = dev_net(dev);
1157
1158         if (unlikely(!idev))
1159                 return ERR_PTR(-ENODEV);
1160
1161         rt = ip6_dst_alloc(net, dev, 0, NULL);
1162         if (unlikely(!rt)) {
1163                 in6_dev_put(idev);
1164                 dst = ERR_PTR(-ENOMEM);
1165                 goto out;
1166         }
1167
             /* Either pin the caller's neighbour or resolve one ourselves. */
1168         if (neigh)
1169                 neigh_hold(neigh);
1170         else {
1171                 neigh = ip6_neigh_lookup(&rt->dst, NULL, &fl6->daddr);
1172                 if (IS_ERR(neigh)) {
1173                         in6_dev_put(idev);
1174                         dst_free(&rt->dst);
1175                         return ERR_CAST(neigh);
1176                 }
1177         }
1178
             /* From here on the idev and neigh references belong to rt. */
1179         rt->dst.flags |= DST_HOST;
1180         rt->dst.output  = ip6_output;
1181         dst_set_neighbour(&rt->dst, neigh);
1182         atomic_set(&rt->dst.__refcnt, 1);
1183         rt->rt6i_dst.addr = fl6->daddr;
1184         rt->rt6i_dst.plen = 128;
1185         rt->rt6i_idev     = idev;
1186         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1187
             /* Push onto the head of the ICMPv6 dst list for later GC. */
1188         spin_lock_bh(&icmp6_dst_lock);
1189         rt->dst.next = icmp6_dst_gc_list;
1190         icmp6_dst_gc_list = &rt->dst;
1191         spin_unlock_bh(&icmp6_dst_lock);
1192
1193         fib6_force_start_gc(net);
1194
1195         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1196
1197 out:
1198         return dst;
1199 }
1200
1201 int icmp6_dst_gc(void)
1202 {
1203         struct dst_entry *dst, **pprev;
1204         int more = 0;
1205
1206         spin_lock_bh(&icmp6_dst_lock);
1207         pprev = &icmp6_dst_gc_list;
1208
1209         while ((dst = *pprev) != NULL) {
1210                 if (!atomic_read(&dst->__refcnt)) {
1211                         *pprev = dst->next;
1212                         dst_free(dst);
1213                 } else {
1214                         pprev = &dst->next;
1215                         ++more;
1216                 }
1217         }
1218
1219         spin_unlock_bh(&icmp6_dst_lock);
1220
1221         return more;
1222 }
1223
1224 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1225                             void *arg)
1226 {
1227         struct dst_entry *dst, **pprev;
1228
1229         spin_lock_bh(&icmp6_dst_lock);
1230         pprev = &icmp6_dst_gc_list;
1231         while ((dst = *pprev) != NULL) {
1232                 struct rt6_info *rt = (struct rt6_info *) dst;
1233                 if (func(rt, arg)) {
1234                         *pprev = dst->next;
1235                         dst_free(dst);
1236                 } else {
1237                         pprev = &dst->next;
1238                 }
1239         }
1240         spin_unlock_bh(&icmp6_dst_lock);
1241 }
1242
1243 static int ip6_dst_gc(struct dst_ops *ops)
1244 {
1245         unsigned long now = jiffies;
1246         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1247         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1248         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1249         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1250         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1251         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1252         int entries;
1253
1254         entries = dst_entries_get_fast(ops);
1255         if (time_after(rt_last_gc + rt_min_interval, now) &&
1256             entries <= rt_max_size)
1257                 goto out;
1258
1259         net->ipv6.ip6_rt_gc_expire++;
1260         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1261         net->ipv6.ip6_rt_last_gc = now;
1262         entries = dst_entries_get_slow(ops);
1263         if (entries < ops->gc_thresh)
1264                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1265 out:
1266         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1267         return entries > rt_max_size;
1268 }
1269
1270 /* Clean host part of a prefix. Not necessary in radix tree,
1271    but results in cleaner routing tables.
1272
1273    Remove it only when all the things will work!
1274  */
1275
1276 int ip6_dst_hoplimit(struct dst_entry *dst)
1277 {
1278         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1279         if (hoplimit == 0) {
1280                 struct net_device *dev = dst->dev;
1281                 struct inet6_dev *idev;
1282
1283                 rcu_read_lock();
1284                 idev = __in6_dev_get(dev);
1285                 if (idev)
1286                         hoplimit = idev->cnf.hop_limit;
1287                 else
1288                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1289                 rcu_read_unlock();
1290         }
1291         return hoplimit;
1292 }
1293 EXPORT_SYMBOL(ip6_dst_hoplimit);
1294
1295 /*
1296  *
1297  */
1298
/*
 * ip6_route_add - build a route from @cfg and insert it into its FIB table.
 * @cfg: route description.  Modified in place: fc_metric and fc_protocol
 *       receive defaults when unset, and fc_nlinfo.nl_net is overwritten
 *       with the namespace of the chosen device before insertion.
 *
 * Returns the result of __ip6_ins_rt() once the route has been built, or a
 * negative errno from validation/setup:
 *   -EINVAL        prefix length above 128; source prefix given without
 *                  CONFIG_IPV6_SUBTREES; non-unicast gateway; gateway route
 *                  whose device is missing or is loopback; fc_prefsrc not
 *                  accepted by ipv6_chk_addr(); metric type above RTAX_MAX
 *   -ENODEV        fc_ifindex does not name a device with an inet6_dev; no
 *                  output device could be determined; loopback has no
 *                  inet6_dev
 *   -ENOBUFS       no FIB table could be obtained
 *   -ENOMEM        route or metrics allocation failed
 *   -EHOSTUNREACH  a non-link-local gateway is not reachable through a
 *                  directly connected (non-RTF_GATEWAY) route on the
 *                  expected device
 * or whatever rt6_bind_neighbour() returns.
 *
 * On every "goto out" path the dev and idev references taken here are
 * dropped and the half-built route is freed.
 */
1299 int ip6_route_add(struct fib6_config *cfg)
1300 {
1301         int err;
1302         struct net *net = cfg->fc_nlinfo.nl_net;
1303         struct rt6_info *rt = NULL;
1304         struct net_device *dev = NULL;
1305         struct inet6_dev *idev = NULL;
1306         struct fib6_table *table;
1307         int addr_type;
1308
1309         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1310                 return -EINVAL;
1311 #ifndef CONFIG_IPV6_SUBTREES
1312         if (cfg->fc_src_len)
1313                 return -EINVAL;
1314 #endif
             /* An explicit interface pins both dev and idev (references held). */
1315         if (cfg->fc_ifindex) {
1316                 err = -ENODEV;
1317                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1318                 if (!dev)
1319                         goto out;
1320                 idev = in6_dev_get(dev);
1321                 if (!idev)
1322                         goto out;
1323         }
1324
1325         if (cfg->fc_metric == 0)
1326                 cfg->fc_metric = IP6_RT_PRIO_USER;
1327
             /*
              * Netlink request without NLM_F_CREATE: prefer an existing table,
              * but still create one (with a warning) if it is missing.
              */
1328         err = -ENOBUFS;
1329         if (cfg->fc_nlinfo.nlh &&
1330             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1331                 table = fib6_get_table(net, cfg->fc_table);
1332                 if (!table) {
1333                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1334                         table = fib6_new_table(net, cfg->fc_table);
1335                 }
1336         } else {
1337                 table = fib6_new_table(net, cfg->fc_table);
1338         }
1339
1340         if (!table)
1341                 goto out;
1342
1343         rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT, table);
1344
1345         if (!rt) {
1346                 err = -ENOMEM;
1347                 goto out;
1348         }
1349
1350         rt->dst.obsolete = -1;
1351
1352         if (cfg->fc_flags & RTF_EXPIRES)
1353                 rt6_set_expires(rt, jiffies +
1354                                 clock_t_to_jiffies(cfg->fc_expires));
1355         else
1356                 rt6_clean_expires(rt);
1357
1358         if (cfg->fc_protocol == RTPROT_UNSPEC)
1359                 cfg->fc_protocol = RTPROT_BOOT;
1360         rt->rt6i_protocol = cfg->fc_protocol;
1361
1362         addr_type = ipv6_addr_type(&cfg->fc_dst);
1363
             /* Input handler depends on destination type and RTF_LOCAL. */
1364         if (addr_type & IPV6_ADDR_MULTICAST)
1365                 rt->dst.input = ip6_mc_input;
1366         else if (cfg->fc_flags & RTF_LOCAL)
1367                 rt->dst.input = ip6_input;
1368         else
1369                 rt->dst.input = ip6_forward;
1370
1371         rt->dst.output = ip6_output;
1372
1373         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1374         rt->rt6i_dst.plen = cfg->fc_dst_len;
1375         if (rt->rt6i_dst.plen == 128)
1376                rt->dst.flags |= DST_HOST;
1377
             /* Non-host routes with metrics get a private, zeroed metrics array. */
1378         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1379                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1380                 if (!metrics) {
1381                         err = -ENOMEM;
1382                         goto out;
1383                 }
1384                 dst_init_metrics(&rt->dst, metrics, 0);
1385         }
1386 #ifdef CONFIG_IPV6_SUBTREES
1387         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1388         rt->rt6i_src.plen = cfg->fc_src_len;
1389 #endif
1390
1391         rt->rt6i_metric = cfg->fc_metric;
1392
1393         /* We cannot add true routes via loopback here,
1394            they would result in kernel looping; promote them to reject routes
1395          */
1396         if ((cfg->fc_flags & RTF_REJECT) ||
1397             (dev && (dev->flags & IFF_LOOPBACK) &&
1398              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1399              !(cfg->fc_flags & RTF_LOCAL))) {
1400                 /* hold loopback dev/idev if we haven't done so. */
1401                 if (dev != net->loopback_dev) {
1402                         if (dev) {
1403                                 dev_put(dev);
1404                                 in6_dev_put(idev);
1405                         }
1406                         dev = net->loopback_dev;
1407                         dev_hold(dev);
1408                         idev = in6_dev_get(dev);
1409                         if (!idev) {
1410                                 err = -ENODEV;
1411                                 goto out;
1412                         }
1413                 }
1414                 rt->dst.output = ip6_pkt_discard_out;
1415                 rt->dst.input = ip6_pkt_discard;
1416                 rt->dst.error = -ENETUNREACH;
1417                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1418                 goto install_route;
1419         }
1420
1421         if (cfg->fc_flags & RTF_GATEWAY) {
1422                 const struct in6_addr *gw_addr;
1423                 int gwa_type;
1424
1425                 gw_addr = &cfg->fc_gateway;
1426                 rt->rt6i_gateway = *gw_addr;
1427                 gwa_type = ipv6_addr_type(gw_addr);
1428
1429                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1430                         struct rt6_info *grt;
1431
1432                         /* IPv6 strictly inhibits using not link-local
1433                            addresses as nexthop address.
1434                            Otherwise, router will not able to send redirects.
1435                            It is very good, but in some (rare!) circumstances
1436                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1437                            some exceptions. --ANK
1438                          */
1439                         err = -EINVAL;
1440                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1441                                 goto out;
1442
1443                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1444
1445                         err = -EHOSTUNREACH;
1446                         if (!grt)
1447                                 goto out;
                             /*
                              * The route to the gateway must use the requested
                              * device; with no device requested, adopt the one
                              * that route uses (taking our own references).
                              */
1448                         if (dev) {
1449                                 if (dev != grt->dst.dev) {
1450                                         dst_release(&grt->dst);
1451                                         goto out;
1452                                 }
1453                         } else {
1454                                 dev = grt->dst.dev;
1455                                 idev = grt->rt6i_idev;
1456                                 dev_hold(dev);
1457                                 in6_dev_hold(grt->rt6i_idev);
1458                         }
                             /* Accept only if the gateway is not itself behind a gateway. */
1459                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1460                                 err = 0;
1461                         dst_release(&grt->dst);
1462
1463                         if (err)
1464                                 goto out;
1465                 }
1466                 err = -EINVAL;
1467                 if (!dev || (dev->flags & IFF_LOOPBACK))
1468                         goto out;
1469         }
1470
1471         err = -ENODEV;
1472         if (!dev)
1473                 goto out;
1474
1475         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1476                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1477                         err = -EINVAL;
1478                         goto out;
1479                 }
1480                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1481                 rt->rt6i_prefsrc.plen = 128;
1482         } else
1483                 rt->rt6i_prefsrc.plen = 0;
1484
1485         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1486                 err = rt6_bind_neighbour(rt, dev);
1487                 if (err)
1488                         goto out;
1489         }
1490
1491         rt->rt6i_flags = cfg->fc_flags;
1492
1493 install_route:
             /* Copy user-supplied metrics; attribute type 0 is skipped. */
1494         if (cfg->fc_mx) {
1495                 struct nlattr *nla;
1496                 int remaining;
1497
1498                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1499                         int type = nla_type(nla);
1500
1501                         if (type) {
1502                                 if (type > RTAX_MAX) {
1503                                         err = -EINVAL;
1504                                         goto out;
1505                                 }
1506
1507                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1508                         }
1509                 }
1510         }
1511
             /* The dev/idev references taken above now belong to the route. */
1512         rt->dst.dev = dev;
1513         rt->rt6i_idev = idev;
1514         rt->rt6i_table = table;
1515
1516         cfg->fc_nlinfo.nl_net = dev_net(dev);
1517
1518         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1519
1520 out:
1521         if (dev)
1522                 dev_put(dev);
1523         if (idev)
1524                 in6_dev_put(idev);
1525         if (rt)
1526                 dst_free(&rt->dst);
1527         return err;
1528 }
1529
1530 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1531 {
1532         int err;
1533         struct fib6_table *table;
1534         struct net *net = dev_net(rt->dst.dev);
1535
1536         if (rt == net->ipv6.ip6_null_entry)
1537                 return -ENOENT;
1538
1539         table = rt->rt6i_table;
1540         write_lock_bh(&table->tb6_lock);
1541
1542         err = fib6_del(rt, info);
1543         dst_release(&rt->dst);
1544
1545         write_unlock_bh(&table->tb6_lock);
1546
1547         return err;
1548 }
1549
1550 int ip6_del_rt(struct rt6_info *rt)
1551 {
1552         struct nl_info info = {
1553                 .nl_net = dev_net(rt->dst.dev),
1554         };
1555         return __ip6_del_rt(rt, &info);
1556 }
1557
1558 static int ip6_route_del(struct fib6_config *cfg)
1559 {
1560         struct fib6_table *table;
1561         struct fib6_node *fn;
1562         struct rt6_info *rt;
1563         int err = -ESRCH;
1564
1565         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1566         if (!table)
1567                 return err;
1568
1569         read_lock_bh(&table->tb6_lock);
1570
1571         fn = fib6_locate(&table->tb6_root,
1572                          &cfg->fc_dst, cfg->fc_dst_len,
1573                          &cfg->fc_src, cfg->fc_src_len);
1574
1575         if (fn) {
1576                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1577                         if (cfg->fc_ifindex &&
1578                             (!rt->dst.dev ||
1579                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1580                                 continue;
1581                         if (cfg->fc_flags & RTF_GATEWAY &&
1582                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1583                                 continue;
1584                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1585                                 continue;
1586                         dst_hold(&rt->dst);
1587                         read_unlock_bh(&table->tb6_lock);
1588
1589                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1590                 }
1591         }
1592         read_unlock_bh(&table->tb6_lock);
1593
1594         return err;
1595 }
1596
1597 /*
1598  *      Handle redirects
1599  */
/*
 * Flow key for redirect lookups: a regular flowi6 plus the address of the
 * router that sent the redirect.  fl6 must stay the first member because
 * __ip6_route_redirect() casts the struct flowi6 pointer it is given back
 * to struct ip6rd_flowi.
 */
1600 struct ip6rd_flowi {
1601         struct flowi6 fl6;
1602         struct in6_addr gateway;
1603 };
1604
1605 static struct rt6_info *__ip6_route_redirect(struct net *net,
1606                                              struct fib6_table *table,
1607                                              struct flowi6 *fl6,
1608                                              int flags)
1609 {
1610         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1611         struct rt6_info *rt;
1612         struct fib6_node *fn;
1613
1614         /*
1615          * Get the "current" route for this destination and
1616          * check if the redirect has come from approriate router.
1617          *
1618          * RFC 2461 specifies that redirects should only be
1619          * accepted if they come from the nexthop to the target.
1620          * Due to the way the routes are chosen, this notion
1621          * is a bit fuzzy and one might need to check all possible
1622          * routes.
1623          */
1624
1625         read_lock_bh(&table->tb6_lock);
1626         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1627 restart:
1628         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1629                 /*
1630                  * Current route is on-link; redirect is always invalid.
1631                  *
1632                  * Seems, previous statement is not true. It could
1633                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1634                  * But then router serving it might decide, that we should
1635                  * know truth 8)8) --ANK (980726).
1636                  */
1637                 if (rt6_check_expired(rt))
1638                         continue;
1639                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1640                         continue;
1641                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1642                         continue;
1643                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1644                         continue;
1645                 break;
1646         }
1647
1648         if (!rt)
1649                 rt = net->ipv6.ip6_null_entry;
1650         BACKTRACK(net, &fl6->saddr);
1651 out:
1652         dst_hold(&rt->dst);
1653
1654         read_unlock_bh(&table->tb6_lock);
1655
1656         return rt;
1657 };
1658
1659 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1660                                            const struct in6_addr *src,
1661                                            const struct in6_addr *gateway,
1662                                            struct net_device *dev)
1663 {
1664         int flags = RT6_LOOKUP_F_HAS_SADDR;
1665         struct net *net = dev_net(dev);
1666         struct ip6rd_flowi rdfl = {
1667                 .fl6 = {
1668                         .flowi6_oif = dev->ifindex,
1669                         .daddr = *dest,
1670                         .saddr = *src,
1671                 },
1672         };
1673
1674         rdfl.gateway = *gateway;
1675
1676         if (rt6_need_strict(dest))
1677                 flags |= RT6_LOOKUP_F_IFACE;
1678
1679         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1680                                                    flags, __ip6_route_redirect);
1681 }
1682
/*
 * rt6_redirect - act on a redirect by installing a cached host route.
 * @dest:    destination being redirected; becomes the /128 of the new route
 * @src:     source address used for the route lookup
 * @saddr:   passed to ip6_route_redirect() as the gateway that must match
 *           the current route's gateway
 * @neigh:   neighbour entry of the new next hop; its primary_key becomes
 *           the new route's gateway and neigh->dev selects the interface
 * @lladdr:  link-layer address handed to neigh_update()
 * @on_link: non-zero when the target is on-link: the new route then has no
 *           RTF_GATEWAY flag and the neighbour's router flag is not forced
 *
 * Nothing is installed when no matching current route exists, when the
 * current route already uses @neigh, or when copying/inserting the new
 * route fails.  After a successful insert, NETEVENT_REDIRECT notifiers are
 * called; if the old route was itself a cached entry it is deleted.
 */
1683 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1684                   const struct in6_addr *saddr,
1685                   struct neighbour *neigh, u8 *lladdr, int on_link)
1686 {
1687         struct rt6_info *rt, *nrt = NULL;
1688         struct netevent_redirect netevent;
1689         struct net *net = dev_net(neigh->dev);
1690
             /* rt comes back with a reference held. */
1691         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1692
1693         if (rt == net->ipv6.ip6_null_entry) {
1694                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1695                 goto out;
1696         }
1697
1698         /*
1699          *      We have finally decided to accept it.
1700          */
1701
1702         neigh_update(neigh, lladdr, NUD_STALE,
1703                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1704                      NEIGH_UPDATE_F_OVERRIDE|
1705                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1706                                      NEIGH_UPDATE_F_ISROUTER))
1707                      );
1708
1709         /*
1710          * Redirect received -> path was valid.
1711          * Look, redirects are sent only in response to data packets,
1712          * so that this nexthop apparently is reachable. --ANK
1713          */
1714         dst_confirm(&rt->dst);
1715
1716         /* Duplicate redirect: silently ignore. */
1717         if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1718                 goto out;
1719
1720         nrt = ip6_rt_copy(rt, dest);
1721         if (!nrt)
1722                 goto out;
1723
1724         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1725         if (on_link)
1726                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1727
1728         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1729         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1730
1731         if (ip6_ins_rt(nrt))
1732                 goto out;
1733
1734         netevent.old = &rt->dst;
1735         netevent.new = &nrt->dst;
1736         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1737
             /*
              * The old route was a cached entry: delete it.  No dst_release()
              * here - our reference on rt is handed to ip6_del_rt().
              */
1738         if (rt->rt6i_flags & RTF_CACHE) {
1739                 ip6_del_rt(rt);
1740                 return;
1741         }
1742
1743 out:
1744         dst_release(&rt->dst);
1745 }
1746
1747 /*
1748  *      Misc support functions
1749  */
1750
1751 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1752                                     const struct in6_addr *dest)
1753 {
1754         struct net *net = dev_net(ort->dst.dev);
1755         struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
1756                                             ort->rt6i_table);
1757
1758         if (rt) {
1759                 rt->dst.input = ort->dst.input;
1760                 rt->dst.output = ort->dst.output;
1761                 rt->dst.flags |= DST_HOST;
1762
1763                 rt->rt6i_dst.addr = *dest;
1764                 rt->rt6i_dst.plen = 128;
1765                 dst_copy_metrics(&rt->dst, &ort->dst);
1766                 rt->dst.error = ort->dst.error;
1767                 rt->rt6i_idev = ort->rt6i_idev;
1768                 if (rt->rt6i_idev)
1769                         in6_dev_hold(rt->rt6i_idev);
1770                 rt->dst.lastuse = jiffies;
1771
1772                 rt->rt6i_gateway = ort->rt6i_gateway;
1773                 rt->rt6i_flags = ort->rt6i_flags;
1774                 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1775                     (RTF_DEFAULT | RTF_ADDRCONF))
1776                         rt6_set_from(rt, ort);
1777                 else
1778                         rt6_clean_expires(rt);
1779                 rt->rt6i_metric = 0;
1780
1781 #ifdef CONFIG_IPV6_SUBTREES
1782                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1783 #endif
1784                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1785                 rt->rt6i_table = ort->rt6i_table;
1786         }
1787         return rt;
1788 }
1789
1790 #ifdef CONFIG_IPV6_ROUTE_INFO
1791 static struct rt6_info *rt6_get_route_info(struct net *net,
1792                                            const struct in6_addr *prefix, int prefixlen,
1793                                            const struct in6_addr *gwaddr, int ifindex)
1794 {
1795         struct fib6_node *fn;
1796         struct rt6_info *rt = NULL;
1797         struct fib6_table *table;
1798
1799         table = fib6_get_table(net, RT6_TABLE_INFO);
1800         if (!table)
1801                 return NULL;
1802
1803         write_lock_bh(&table->tb6_lock);
1804         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1805         if (!fn)
1806                 goto out;
1807
1808         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1809                 if (rt->dst.dev->ifindex != ifindex)
1810                         continue;
1811                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1812                         continue;
1813                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1814                         continue;
1815                 dst_hold(&rt->dst);
1816                 break;
1817         }
1818 out:
1819         write_unlock_bh(&table->tb6_lock);
1820         return rt;
1821 }
1822
1823 static struct rt6_info *rt6_add_route_info(struct net *net,
1824                                            const struct in6_addr *prefix, int prefixlen,
1825                                            const struct in6_addr *gwaddr, int ifindex,
1826                                            unsigned int pref)
1827 {
1828         struct fib6_config cfg = {
1829                 .fc_table       = RT6_TABLE_INFO,
1830                 .fc_metric      = IP6_RT_PRIO_USER,
1831                 .fc_ifindex     = ifindex,
1832                 .fc_dst_len     = prefixlen,
1833                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1834                                   RTF_UP | RTF_PREF(pref),
1835                 .fc_nlinfo.pid = 0,
1836                 .fc_nlinfo.nlh = NULL,
1837                 .fc_nlinfo.nl_net = net,
1838         };
1839
1840         cfg.fc_dst = *prefix;
1841         cfg.fc_gateway = *gwaddr;
1842
1843         /* We should treat it as a default route if prefix length is 0. */
1844         if (!prefixlen)
1845                 cfg.fc_flags |= RTF_DEFAULT;
1846
1847         ip6_route_add(&cfg);
1848
1849         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1850 }
1851 #endif
1852
1853 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1854 {
1855         struct rt6_info *rt;
1856         struct fib6_table *table;
1857
1858         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1859         if (!table)
1860                 return NULL;
1861
1862         write_lock_bh(&table->tb6_lock);
1863         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1864                 if (dev == rt->dst.dev &&
1865                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1866                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1867                         break;
1868         }
1869         if (rt)
1870                 dst_hold(&rt->dst);
1871         write_unlock_bh(&table->tb6_lock);
1872         return rt;
1873 }
1874
1875 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1876                                      struct net_device *dev,
1877                                      unsigned int pref)
1878 {
1879         struct fib6_config cfg = {
1880                 .fc_table       = RT6_TABLE_DFLT,
1881                 .fc_metric      = IP6_RT_PRIO_USER,
1882                 .fc_ifindex     = dev->ifindex,
1883                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1884                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1885                 .fc_nlinfo.pid = 0,
1886                 .fc_nlinfo.nlh = NULL,
1887                 .fc_nlinfo.nl_net = dev_net(dev),
1888         };
1889
1890         cfg.fc_gateway = *gwaddr;
1891
1892         ip6_route_add(&cfg);
1893
1894         return rt6_get_dflt_router(gwaddr, dev);
1895 }
1896
1897 void rt6_purge_dflt_routers(struct net *net)
1898 {
1899         struct rt6_info *rt;
1900         struct fib6_table *table;
1901
1902         /* NOTE: Keep consistent with rt6_get_dflt_router */
1903         table = fib6_get_table(net, RT6_TABLE_DFLT);
1904         if (!table)
1905                 return;
1906
1907 restart:
1908         read_lock_bh(&table->tb6_lock);
1909         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1910                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1911                         dst_hold(&rt->dst);
1912                         read_unlock_bh(&table->tb6_lock);
1913                         ip6_del_rt(rt);
1914                         goto restart;
1915                 }
1916         }
1917         read_unlock_bh(&table->tb6_lock);
1918 }
1919
1920 static void rtmsg_to_fib6_config(struct net *net,
1921                                  struct in6_rtmsg *rtmsg,
1922                                  struct fib6_config *cfg)
1923 {
1924         memset(cfg, 0, sizeof(*cfg));
1925
1926         cfg->fc_table = RT6_TABLE_MAIN;
1927         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1928         cfg->fc_metric = rtmsg->rtmsg_metric;
1929         cfg->fc_expires = rtmsg->rtmsg_info;
1930         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1931         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1932         cfg->fc_flags = rtmsg->rtmsg_flags;
1933
1934         cfg->fc_nlinfo.nl_net = net;
1935
1936         cfg->fc_dst = rtmsg->rtmsg_dst;
1937         cfg->fc_src = rtmsg->rtmsg_src;
1938         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1939 }
1940
1941 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1942 {
1943         struct fib6_config cfg;
1944         struct in6_rtmsg rtmsg;
1945         int err;
1946
1947         switch(cmd) {
1948         case SIOCADDRT:         /* Add a route */
1949         case SIOCDELRT:         /* Delete a route */
1950                 if (!capable(CAP_NET_ADMIN))
1951                         return -EPERM;
1952                 err = copy_from_user(&rtmsg, arg,
1953                                      sizeof(struct in6_rtmsg));
1954                 if (err)
1955                         return -EFAULT;
1956
1957                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1958
1959                 rtnl_lock();
1960                 switch (cmd) {
1961                 case SIOCADDRT:
1962                         err = ip6_route_add(&cfg);
1963                         break;
1964                 case SIOCDELRT:
1965                         err = ip6_route_del(&cfg);
1966                         break;
1967                 default:
1968                         err = -EINVAL;
1969                 }
1970                 rtnl_unlock();
1971
1972                 return err;
1973         }
1974
1975         return -EINVAL;
1976 }
1977
1978 /*
1979  *      Drop the packet on the floor
1980  */
1981
1982 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1983 {
1984         int type;
1985         struct dst_entry *dst = skb_dst(skb);
1986         switch (ipstats_mib_noroutes) {
1987         case IPSTATS_MIB_INNOROUTES:
1988                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1989                 if (type == IPV6_ADDR_ANY) {
1990                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1991                                       IPSTATS_MIB_INADDRERRORS);
1992                         break;
1993                 }
1994                 /* FALLTHROUGH */
1995         case IPSTATS_MIB_OUTNOROUTES:
1996                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1997                               ipstats_mib_noroutes);
1998                 break;
1999         }
2000         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2001         kfree_skb(skb);
2002         return 0;
2003 }
2004
2005 static int ip6_pkt_discard(struct sk_buff *skb)
2006 {
2007         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2008 }
2009
2010 static int ip6_pkt_discard_out(struct sk_buff *skb)
2011 {
2012         skb->dev = skb_dst(skb)->dev;
2013         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2014 }
2015
2016 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2017
2018 static int ip6_pkt_prohibit(struct sk_buff *skb)
2019 {
2020         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2021 }
2022
2023 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2024 {
2025         skb->dev = skb_dst(skb)->dev;
2026         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2027 }
2028
2029 #endif
2030
2031 /*
2032  *      Allocate a dst for local (unicast / anycast) address.
2033  */
2034
2035 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2036                                     const struct in6_addr *addr,
2037                                     bool anycast)
2038 {
2039         struct net *net = dev_net(idev->dev);
2040         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL);
2041         int err;
2042
2043         if (!rt) {
2044                 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2045                 return ERR_PTR(-ENOMEM);
2046         }
2047
2048         in6_dev_hold(idev);
2049
2050         rt->dst.flags |= DST_HOST;
2051         rt->dst.input = ip6_input;
2052         rt->dst.output = ip6_output;
2053         rt->rt6i_idev = idev;
2054         rt->dst.obsolete = -1;
2055
2056         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2057         if (anycast)
2058                 rt->rt6i_flags |= RTF_ANYCAST;
2059         else
2060                 rt->rt6i_flags |= RTF_LOCAL;
2061         err = rt6_bind_neighbour(rt, rt->dst.dev);
2062         if (err) {
2063                 dst_free(&rt->dst);
2064                 return ERR_PTR(err);
2065         }
2066
2067         rt->rt6i_dst.addr = *addr;
2068         rt->rt6i_dst.plen = 128;
2069         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2070
2071         atomic_set(&rt->dst.__refcnt, 1);
2072
2073         return rt;
2074 }
2075
2076 int ip6_route_get_saddr(struct net *net,
2077                         struct rt6_info *rt,
2078                         const struct in6_addr *daddr,
2079                         unsigned int prefs,
2080                         struct in6_addr *saddr)
2081 {
2082         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2083         int err = 0;
2084         if (rt->rt6i_prefsrc.plen)
2085                 *saddr = rt->rt6i_prefsrc.addr;
2086         else
2087                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2088                                          daddr, prefs, saddr);
2089         return err;
2090 }
2091
2092 /* remove deleted ip from prefsrc entries */
/* Walker argument for fib6_remove_prefsrc(). */
struct arg_dev_net_ip {
	struct net_device *dev;		/* restrict to this device; NULL = any */
	struct net *net;		/* namespace owning ip6_null_entry */
	struct in6_addr *addr;		/* address to drop as preferred source */
};
2098
2099 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2100 {
2101         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2102         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2103         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2104
2105         if (((void *)rt->dst.dev == dev || !dev) &&
2106             rt != net->ipv6.ip6_null_entry &&
2107             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2108                 /* remove prefsrc entry */
2109                 rt->rt6i_prefsrc.plen = 0;
2110         }
2111         return 0;
2112 }
2113
2114 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2115 {
2116         struct net *net = dev_net(ifp->idev->dev);
2117         struct arg_dev_net_ip adni = {
2118                 .dev = ifp->idev->dev,
2119                 .net = net,
2120                 .addr = &ifp->addr,
2121         };
2122         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2123 }
2124
/* Walker argument for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device going down; NULL = any */
	struct net *net;		/* namespace owning ip6_null_entry */
};
2129
2130 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2131 {
2132         const struct arg_dev_net *adn = arg;
2133         const struct net_device *dev = adn->dev;
2134
2135         if ((rt->dst.dev == dev || !dev) &&
2136             rt != adn->net->ipv6.ip6_null_entry)
2137                 return -1;
2138
2139         return 0;
2140 }
2141
2142 void rt6_ifdown(struct net *net, struct net_device *dev)
2143 {
2144         struct arg_dev_net adn = {
2145                 .dev = dev,
2146                 .net = net,
2147         };
2148
2149         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2150         icmp6_clean_all(fib6_ifdown, &adn);
2151 }
2152
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
2157
2158 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2159 {
2160         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2161         struct inet6_dev *idev;
2162
2163         /* In IPv6 pmtu discovery is not optional,
2164            so that RTAX_MTU lock cannot disable it.
2165            We still use this lock to block changes
2166            caused by addrconf/ndisc.
2167         */
2168
2169         idev = __in6_dev_get(arg->dev);
2170         if (!idev)
2171                 return 0;
2172
2173         /* For administrative MTU increase, there is no way to discover
2174            IPv6 PMTU increase, so PMTU increase should be updated here.
2175            Since RFC 1981 doesn't include administrative MTU increase
2176            update PMTU increase is a MUST. (i.e. jumbo frame)
2177          */
2178         /*
2179            If new MTU is less than route PMTU, this new MTU will be the
2180            lowest MTU in the path, update the route PMTU to reflect PMTU
2181            decreases; if new MTU is greater than route PMTU, and the
2182            old MTU is the lowest MTU in the path, update the route PMTU
2183            to reflect the increase. In this case if the other nodes' MTU
2184            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2185            PMTU discouvery.
2186          */
2187         if (rt->dst.dev == arg->dev &&
2188             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2189             (dst_mtu(&rt->dst) >= arg->mtu ||
2190              (dst_mtu(&rt->dst) < arg->mtu &&
2191               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2192                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2193         }
2194         return 0;
2195 }
2196
2197 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2198 {
2199         struct rt6_mtu_change_arg arg = {
2200                 .dev = dev,
2201                 .mtu = mtu,
2202         };
2203
2204         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2205 }
2206
2207 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2208         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2209         [RTA_OIF]               = { .type = NLA_U32 },
2210         [RTA_IIF]               = { .type = NLA_U32 },
2211         [RTA_PRIORITY]          = { .type = NLA_U32 },
2212         [RTA_METRICS]           = { .type = NLA_NESTED },
2213 };
2214
2215 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2216                               struct fib6_config *cfg)
2217 {
2218         struct rtmsg *rtm;
2219         struct nlattr *tb[RTA_MAX+1];
2220         int err;
2221
2222         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2223         if (err < 0)
2224                 goto errout;
2225
2226         err = -EINVAL;
2227         rtm = nlmsg_data(nlh);
2228         memset(cfg, 0, sizeof(*cfg));
2229
2230         cfg->fc_table = rtm->rtm_table;
2231         cfg->fc_dst_len = rtm->rtm_dst_len;
2232         cfg->fc_src_len = rtm->rtm_src_len;
2233         cfg->fc_flags = RTF_UP;
2234         cfg->fc_protocol = rtm->rtm_protocol;
2235
2236         if (rtm->rtm_type == RTN_UNREACHABLE)
2237                 cfg->fc_flags |= RTF_REJECT;
2238
2239         if (rtm->rtm_type == RTN_LOCAL)
2240                 cfg->fc_flags |= RTF_LOCAL;
2241
2242         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2243         cfg->fc_nlinfo.nlh = nlh;
2244         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2245
2246         if (tb[RTA_GATEWAY]) {
2247                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2248                 cfg->fc_flags |= RTF_GATEWAY;
2249         }
2250
2251         if (tb[RTA_DST]) {
2252                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2253
2254                 if (nla_len(tb[RTA_DST]) < plen)
2255                         goto errout;
2256
2257                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2258         }
2259
2260         if (tb[RTA_SRC]) {
2261                 int plen = (rtm->rtm_src_len + 7) >> 3;
2262
2263                 if (nla_len(tb[RTA_SRC]) < plen)
2264                         goto errout;
2265
2266                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2267         }
2268
2269         if (tb[RTA_PREFSRC])
2270                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2271
2272         if (tb[RTA_OIF])
2273                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2274
2275         if (tb[RTA_PRIORITY])
2276                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2277
2278         if (tb[RTA_METRICS]) {
2279                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2280                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2281         }
2282
2283         if (tb[RTA_TABLE])
2284                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2285
2286         err = 0;
2287 errout:
2288         return err;
2289 }
2290
2291 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2292 {
2293         struct fib6_config cfg;
2294         int err;
2295
2296         err = rtm_to_fib6_config(skb, nlh, &cfg);
2297         if (err < 0)
2298                 return err;
2299
2300         return ip6_route_del(&cfg);
2301 }
2302
2303 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2304 {
2305         struct fib6_config cfg;
2306         int err;
2307
2308         err = rtm_to_fib6_config(skb, nlh, &cfg);
2309         if (err < 0)
2310                 return err;
2311
2312         return ip6_route_add(&cfg);
2313 }
2314
2315 static inline size_t rt6_nlmsg_size(void)
2316 {
2317         return NLMSG_ALIGN(sizeof(struct rtmsg))
2318                + nla_total_size(16) /* RTA_SRC */
2319                + nla_total_size(16) /* RTA_DST */
2320                + nla_total_size(16) /* RTA_GATEWAY */
2321                + nla_total_size(16) /* RTA_PREFSRC */
2322                + nla_total_size(4) /* RTA_TABLE */
2323                + nla_total_size(4) /* RTA_IIF */
2324                + nla_total_size(4) /* RTA_OIF */
2325                + nla_total_size(4) /* RTA_PRIORITY */
2326                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2327                + nla_total_size(sizeof(struct rta_cacheinfo));
2328 }
2329
2330 static int rt6_fill_node(struct net *net,
2331                          struct sk_buff *skb, struct rt6_info *rt,
2332                          struct in6_addr *dst, struct in6_addr *src,
2333                          int iif, int type, u32 pid, u32 seq,
2334                          int prefix, int nowait, unsigned int flags)
2335 {
2336         const struct inet_peer *peer;
2337         struct rtmsg *rtm;
2338         struct nlmsghdr *nlh;
2339         long expires;
2340         u32 table;
2341         struct neighbour *n;
2342         u32 ts, tsage;
2343
2344         if (prefix) {   /* user wants prefix routes only */
2345                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2346                         /* success since this is not a prefix route */
2347                         return 1;
2348                 }
2349         }
2350
2351         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2352         if (!nlh)
2353                 return -EMSGSIZE;
2354
2355         rtm = nlmsg_data(nlh);
2356         rtm->rtm_family = AF_INET6;
2357         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2358         rtm->rtm_src_len = rt->rt6i_src.plen;
2359         rtm->rtm_tos = 0;
2360         if (rt->rt6i_table)
2361                 table = rt->rt6i_table->tb6_id;
2362         else
2363                 table = RT6_TABLE_UNSPEC;
2364         rtm->rtm_table = table;
2365         if (nla_put_u32(skb, RTA_TABLE, table))
2366                 goto nla_put_failure;
2367         if (rt->rt6i_flags & RTF_REJECT)
2368                 rtm->rtm_type = RTN_UNREACHABLE;
2369         else if (rt->rt6i_flags & RTF_LOCAL)
2370                 rtm->rtm_type = RTN_LOCAL;
2371         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2372                 rtm->rtm_type = RTN_LOCAL;
2373         else
2374                 rtm->rtm_type = RTN_UNICAST;
2375         rtm->rtm_flags = 0;
2376         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2377         rtm->rtm_protocol = rt->rt6i_protocol;
2378         if (rt->rt6i_flags & RTF_DYNAMIC)
2379                 rtm->rtm_protocol = RTPROT_REDIRECT;
2380         else if (rt->rt6i_flags & RTF_ADDRCONF)
2381                 rtm->rtm_protocol = RTPROT_KERNEL;
2382         else if (rt->rt6i_flags & RTF_DEFAULT)
2383                 rtm->rtm_protocol = RTPROT_RA;
2384
2385         if (rt->rt6i_flags & RTF_CACHE)
2386                 rtm->rtm_flags |= RTM_F_CLONED;
2387
2388         if (dst) {
2389                 if (nla_put(skb, RTA_DST, 16, dst))
2390                         goto nla_put_failure;
2391                 rtm->rtm_dst_len = 128;
2392         } else if (rtm->rtm_dst_len)
2393                 if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2394                         goto nla_put_failure;
2395 #ifdef CONFIG_IPV6_SUBTREES
2396         if (src) {
2397                 if (nla_put(skb, RTA_SRC, 16, src))
2398                         goto nla_put_failure;
2399                 rtm->rtm_src_len = 128;
2400         } else if (rtm->rtm_src_len &&
2401                    nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2402                 goto nla_put_failure;
2403 #endif
2404         if (iif) {
2405 #ifdef CONFIG_IPV6_MROUTE
2406                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2407                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2408                         if (err <= 0) {
2409                                 if (!nowait) {
2410                                         if (err == 0)
2411                                                 return 0;
2412                                         goto nla_put_failure;
2413                                 } else {
2414                                         if (err == -EMSGSIZE)
2415                                                 goto nla_put_failure;
2416                                 }
2417                         }
2418                 } else
2419 #endif
2420                         if (nla_put_u32(skb, RTA_IIF, iif))
2421                                 goto nla_put_failure;
2422         } else if (dst) {
2423                 struct in6_addr saddr_buf;
2424                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2425                     nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2426                         goto nla_put_failure;
2427         }
2428
2429         if (rt->rt6i_prefsrc.plen) {
2430                 struct in6_addr saddr_buf;
2431                 saddr_buf = rt->rt6i_prefsrc.addr;
2432                 if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2433                         goto nla_put_failure;
2434         }
2435
2436         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2437                 goto nla_put_failure;
2438
2439         rcu_read_lock();
2440         n = dst_get_neighbour_noref(&rt->dst);
2441         if (n) {
2442                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2443                         rcu_read_unlock();
2444                         goto nla_put_failure;
2445                 }
2446         }
2447         rcu_read_unlock();
2448
2449         if (rt->dst.dev &&
2450             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2451                 goto nla_put_failure;
2452         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2453                 goto nla_put_failure;
2454         if (!(rt->rt6i_flags & RTF_EXPIRES))
2455                 expires = 0;
2456         else if (rt->dst.expires - jiffies < INT_MAX)
2457                 expires = rt->dst.expires - jiffies;
2458         else
2459                 expires = INT_MAX;
2460
2461         peer = NULL;
2462         if (rt6_has_peer(rt))
2463                 peer = rt6_peer_ptr(rt);
2464         ts = tsage = 0;
2465         if (peer && peer->tcp_ts_stamp) {
2466                 ts = peer->tcp_ts;
2467                 tsage = get_seconds() - peer->tcp_ts_stamp;
2468         }
2469
2470         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2471                                expires, rt->dst.error) < 0)
2472                 goto nla_put_failure;
2473
2474         return nlmsg_end(skb, nlh);
2475
2476 nla_put_failure:
2477         nlmsg_cancel(skb, nlh);
2478         return -EMSGSIZE;
2479 }
2480
2481 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2482 {
2483         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2484         int prefix;
2485
2486         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2487                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2488                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2489         } else
2490                 prefix = 0;
2491
2492         return rt6_fill_node(arg->net,
2493                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2494                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2495                      prefix, 0, NLM_F_MULTI);
2496 }
2497
2498 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2499 {
2500         struct net *net = sock_net(in_skb->sk);
2501         struct nlattr *tb[RTA_MAX+1];
2502         struct rt6_info *rt;
2503         struct sk_buff *skb;
2504         struct rtmsg *rtm;
2505         struct flowi6 fl6;
2506         int err, iif = 0, oif = 0;
2507
2508         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2509         if (err < 0)
2510                 goto errout;
2511
2512         err = -EINVAL;
2513         memset(&fl6, 0, sizeof(fl6));
2514
2515         if (tb[RTA_SRC]) {
2516                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2517                         goto errout;
2518
2519                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2520         }
2521
2522         if (tb[RTA_DST]) {
2523                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2524                         goto errout;
2525
2526                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2527         }
2528
2529         if (tb[RTA_IIF])
2530                 iif = nla_get_u32(tb[RTA_IIF]);
2531
2532         if (tb[RTA_OIF])
2533                 oif = nla_get_u32(tb[RTA_OIF]);
2534
2535         if (iif) {
2536                 struct net_device *dev;
2537                 int flags = 0;
2538
2539                 dev = __dev_get_by_index(net, iif);
2540                 if (!dev) {
2541                         err = -ENODEV;
2542                         goto errout;
2543                 }
2544
2545                 fl6.flowi6_iif = iif;
2546
2547                 if (!ipv6_addr_any(&fl6.saddr))
2548                         flags |= RT6_LOOKUP_F_HAS_SADDR;
2549
2550                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2551                                                                flags);
2552         } else {
2553                 fl6.flowi6_oif = oif;
2554
2555                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2556         }
2557
2558         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2559         if (!skb) {
2560                 dst_release(&rt->dst);
2561                 err = -ENOBUFS;
2562                 goto errout;
2563         }
2564
2565         /* Reserve room for dummy headers, this skb can pass
2566            through good chunk of routing engine.
2567          */
2568         skb_reset_mac_header(skb);
2569         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2570
2571         skb_dst_set(skb, &rt->dst);
2572
2573         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2574                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2575                             nlh->nlmsg_seq, 0, 0, 0);
2576         if (err < 0) {
2577                 kfree_skb(skb);
2578                 goto errout;
2579         }
2580
2581         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2582 errout:
2583         return err;
2584 }
2585
2586 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2587 {
2588         struct sk_buff *skb;
2589         struct net *net = info->nl_net;
2590         u32 seq;
2591         int err;
2592
2593         err = -ENOBUFS;
2594         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2595
2596         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2597         if (!skb)
2598                 goto errout;
2599
2600         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2601                                 event, info->pid, seq, 0, 0, 0);
2602         if (err < 0) {
2603                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2604                 WARN_ON(err == -EMSGSIZE);
2605                 kfree_skb(skb);
2606                 goto errout;
2607         }
2608         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2609                     info->nlh, gfp_any());
2610         return;
2611 errout:
2612         if (err < 0)
2613                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2614 }
2615
2616 static int ip6_route_dev_notify(struct notifier_block *this,
2617                                 unsigned long event, void *data)
2618 {
2619         struct net_device *dev = (struct net_device *)data;
2620         struct net *net = dev_net(dev);
2621
2622         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2623                 net->ipv6.ip6_null_entry->dst.dev = dev;
2624                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2625 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2626                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2627                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2628                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2629                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2630 #endif
2631         }
2632
2633         return NOTIFY_OK;
2634 }
2635
2636 /*
2637  *      /proc
2638  */
2639
2640 #ifdef CONFIG_PROC_FS
2641
/*
 * Buffer/position bookkeeping record for /proc output.
 * NOTE(review): no user of this struct is visible in this part of the
 * file; confirm whether it is still referenced anywhere.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2650
2651 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2652 {
2653         struct seq_file *m = p_arg;
2654         struct neighbour *n;
2655
2656         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2657
2658 #ifdef CONFIG_IPV6_SUBTREES
2659         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2660 #else
2661         seq_puts(m, "00000000000000000000000000000000 00 ");
2662 #endif
2663         rcu_read_lock();
2664         n = dst_get_neighbour_noref(&rt->dst);
2665         if (n) {
2666                 seq_printf(m, "%pi6", n->primary_key);
2667         } else {
2668                 seq_puts(m, "00000000000000000000000000000000");
2669         }
2670         rcu_read_unlock();
2671         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2672                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2673                    rt->dst.__use, rt->rt6i_flags,
2674                    rt->dst.dev ? rt->dst.dev->name : "");
2675         return 0;
2676 }
2677
2678 static int ipv6_route_show(struct seq_file *m, void *v)
2679 {
2680         struct net *net = (struct net *)m->private;
2681         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2682         return 0;
2683 }
2684
2685 static int ipv6_route_open(struct inode *inode, struct file *file)
2686 {
2687         return single_open_net(inode, file, ipv6_route_show);
2688 }
2689
2690 static const struct file_operations ipv6_route_proc_fops = {
2691         .owner          = THIS_MODULE,
2692         .open           = ipv6_route_open,
2693         .read           = seq_read,
2694         .llseek         = seq_lseek,
2695         .release        = single_release_net,
2696 };
2697
2698 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2699 {
2700         struct net *net = (struct net *)seq->private;
2701         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2702                    net->ipv6.rt6_stats->fib_nodes,
2703                    net->ipv6.rt6_stats->fib_route_nodes,
2704                    net->ipv6.rt6_stats->fib_rt_alloc,
2705                    net->ipv6.rt6_stats->fib_rt_entries,
2706                    net->ipv6.rt6_stats->fib_rt_cache,
2707                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2708                    net->ipv6.rt6_stats->fib_discarded_routes);
2709
2710         return 0;
2711 }
2712
2713 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2714 {
2715         return single_open_net(inode, file, rt6_stats_seq_show);
2716 }
2717
2718 static const struct file_operations rt6_stats_seq_fops = {
2719         .owner   = THIS_MODULE,
2720         .open    = rt6_stats_seq_open,
2721         .read    = seq_read,
2722         .llseek  = seq_lseek,
2723         .release = single_release_net,
2724 };
2725 #endif  /* CONFIG_PROC_FS */
2726
2727 #ifdef CONFIG_SYSCTL
2728
2729 static
2730 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2731                               void __user *buffer, size_t *lenp, loff_t *ppos)
2732 {
2733         struct net *net;
2734         int delay;
2735         if (!write)
2736                 return -EINVAL;
2737
2738         net = (struct net *)ctl->extra1;
2739         delay = net->ipv6.sysctl.flush_delay;
2740         proc_dointvec(ctl, write, buffer, lenp, ppos);
2741         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2742         return 0;
2743 }
2744
2745 ctl_table ipv6_route_table_template[] = {
2746         {
2747                 .procname       =       "flush",
2748                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2749                 .maxlen         =       sizeof(int),
2750                 .mode           =       0200,
2751                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2752         },
2753         {
2754                 .procname       =       "gc_thresh",
2755                 .data           =       &ip6_dst_ops_template.gc_thresh,
2756                 .maxlen         =       sizeof(int),
2757                 .mode           =       0644,
2758                 .proc_handler   =       proc_dointvec,
2759         },
2760         {
2761                 .procname       =       "max_size",
2762                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2763                 .maxlen         =       sizeof(int),
2764                 .mode           =       0644,
2765                 .proc_handler   =       proc_dointvec,
2766         },
2767         {
2768                 .procname       =       "gc_min_interval",
2769                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2770                 .maxlen         =       sizeof(int),
2771                 .mode           =       0644,
2772                 .proc_handler   =       proc_dointvec_jiffies,
2773         },
2774         {
2775                 .procname       =       "gc_timeout",
2776                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2777                 .maxlen         =       sizeof(int),
2778                 .mode           =       0644,
2779                 .proc_handler   =       proc_dointvec_jiffies,
2780         },
2781         {
2782                 .procname       =       "gc_interval",
2783                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2784                 .maxlen         =       sizeof(int),
2785                 .mode           =       0644,
2786                 .proc_handler   =       proc_dointvec_jiffies,
2787         },
2788         {
2789                 .procname       =       "gc_elasticity",
2790                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2791                 .maxlen         =       sizeof(int),
2792                 .mode           =       0644,
2793                 .proc_handler   =       proc_dointvec,
2794         },
2795         {
2796                 .procname       =       "mtu_expires",
2797                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2798                 .maxlen         =       sizeof(int),
2799                 .mode           =       0644,
2800                 .proc_handler   =       proc_dointvec_jiffies,
2801         },
2802         {
2803                 .procname       =       "min_adv_mss",
2804                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2805                 .maxlen         =       sizeof(int),
2806                 .mode           =       0644,
2807                 .proc_handler   =       proc_dointvec,
2808         },
2809         {
2810                 .procname       =       "gc_min_interval_ms",
2811                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2812                 .maxlen         =       sizeof(int),
2813                 .mode           =       0644,
2814                 .proc_handler   =       proc_dointvec_ms_jiffies,
2815         },
2816         { }
2817 };
2818
2819 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2820 {
2821         struct ctl_table *table;
2822
2823         table = kmemdup(ipv6_route_table_template,
2824                         sizeof(ipv6_route_table_template),
2825                         GFP_KERNEL);
2826
2827         if (table) {
2828                 table[0].data = &net->ipv6.sysctl.flush_delay;
2829                 table[0].extra1 = net;
2830                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2831                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2832                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2833                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2834                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2835                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2836                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2837                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2838                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2839         }
2840
2841         return table;
2842 }
2843 #endif
2844
2845 static int __net_init ip6_route_net_init(struct net *net)
2846 {
2847         int ret = -ENOMEM;
2848
2849         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2850                sizeof(net->ipv6.ip6_dst_ops));
2851
2852         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2853                 goto out_ip6_dst_ops;
2854
2855         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2856                                            sizeof(*net->ipv6.ip6_null_entry),
2857                                            GFP_KERNEL);
2858         if (!net->ipv6.ip6_null_entry)
2859                 goto out_ip6_dst_entries;
2860         net->ipv6.ip6_null_entry->dst.path =
2861                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2862         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2863         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2864                          ip6_template_metrics, true);
2865
2866 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2867         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2868                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2869                                                GFP_KERNEL);
2870         if (!net->ipv6.ip6_prohibit_entry)
2871                 goto out_ip6_null_entry;
2872         net->ipv6.ip6_prohibit_entry->dst.path =
2873                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2874         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2875         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2876                          ip6_template_metrics, true);
2877
2878         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2879                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2880                                                GFP_KERNEL);
2881         if (!net->ipv6.ip6_blk_hole_entry)
2882                 goto out_ip6_prohibit_entry;
2883         net->ipv6.ip6_blk_hole_entry->dst.path =
2884                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2885         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2886         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2887                          ip6_template_metrics, true);
2888 #endif
2889
2890         net->ipv6.sysctl.flush_delay = 0;
2891         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2892         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2893         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2894         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2895         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2896         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2897         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2898
2899         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2900
2901         ret = 0;
2902 out:
2903         return ret;
2904
2905 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2906 out_ip6_prohibit_entry:
2907         kfree(net->ipv6.ip6_prohibit_entry);
2908 out_ip6_null_entry:
2909         kfree(net->ipv6.ip6_null_entry);
2910 #endif
2911 out_ip6_dst_entries:
2912         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2913 out_ip6_dst_ops:
2914         goto out;
2915 }
2916
2917 static void __net_exit ip6_route_net_exit(struct net *net)
2918 {
2919         kfree(net->ipv6.ip6_null_entry);
2920 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2921         kfree(net->ipv6.ip6_prohibit_entry);
2922         kfree(net->ipv6.ip6_blk_hole_entry);
2923 #endif
2924         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2925 }
2926
2927 static int __net_init ip6_route_net_init_late(struct net *net)
2928 {
2929 #ifdef CONFIG_PROC_FS
2930         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2931         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2932 #endif
2933         return 0;
2934 }
2935
2936 static void __net_exit ip6_route_net_exit_late(struct net *net)
2937 {
2938 #ifdef CONFIG_PROC_FS
2939         proc_net_remove(net, "ipv6_route");
2940         proc_net_remove(net, "rt6_stats");
2941 #endif
2942 }
2943
2944 static struct pernet_operations ip6_route_net_ops = {
2945         .init = ip6_route_net_init,
2946         .exit = ip6_route_net_exit,
2947 };
2948
2949 static int __net_init ipv6_inetpeer_init(struct net *net)
2950 {
2951         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2952
2953         if (!bp)
2954                 return -ENOMEM;
2955         inet_peer_base_init(bp);
2956         net->ipv6.peers = bp;
2957         return 0;
2958 }
2959
2960 static void __net_exit ipv6_inetpeer_exit(struct net *net)
2961 {
2962         struct inet_peer_base *bp = net->ipv6.peers;
2963
2964         net->ipv6.peers = NULL;
2965         inetpeer_invalidate_tree(bp);
2966         kfree(bp);
2967 }
2968
2969 static struct pernet_operations ipv6_inetpeer_ops = {
2970         .init   =       ipv6_inetpeer_init,
2971         .exit   =       ipv6_inetpeer_exit,
2972 };
2973
2974 static struct pernet_operations ip6_route_net_late_ops = {
2975         .init = ip6_route_net_init_late,
2976         .exit = ip6_route_net_exit_late,
2977 };
2978
2979 static struct notifier_block ip6_route_dev_notifier = {
2980         .notifier_call = ip6_route_dev_notify,
2981         .priority = 0,
2982 };
2983
2984 int __init ip6_route_init(void)
2985 {
2986         int ret;
2987
2988         ret = -ENOMEM;
2989         ip6_dst_ops_template.kmem_cachep =
2990                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2991                                   SLAB_HWCACHE_ALIGN, NULL);
2992         if (!ip6_dst_ops_template.kmem_cachep)
2993                 goto out;
2994
2995         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2996         if (ret)
2997                 goto out_kmem_cache;
2998
2999         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3000         if (ret)
3001                 goto out_dst_entries;
3002
3003         ret = register_pernet_subsys(&ip6_route_net_ops);
3004         if (ret)
3005                 goto out_register_inetpeer;
3006
3007         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3008
3009         /* Registering of the loopback is done before this portion of code,
3010          * the loopback reference in rt6_info will not be taken, do it
3011          * manually for init_net */
3012         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3013         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3014   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3015         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3016         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3017         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3018         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3019   #endif
3020         ret = fib6_init();
3021         if (ret)
3022                 goto out_register_subsys;
3023
3024         ret = xfrm6_init();
3025         if (ret)
3026                 goto out_fib6_init;
3027
3028         ret = fib6_rules_init();
3029         if (ret)
3030                 goto xfrm6_init;
3031
3032         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3033         if (ret)
3034                 goto fib6_rules_init;
3035
3036         ret = -ENOBUFS;
3037         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3038             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3039             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3040                 goto out_register_late_subsys;
3041
3042         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3043         if (ret)
3044                 goto out_register_late_subsys;
3045
3046 out:
3047         return ret;
3048
3049 out_register_late_subsys:
3050         unregister_pernet_subsys(&ip6_route_net_late_ops);
3051 fib6_rules_init:
3052         fib6_rules_cleanup();
3053 xfrm6_init:
3054         xfrm6_fini();
3055 out_fib6_init:
3056         fib6_gc_cleanup();
3057 out_register_subsys:
3058         unregister_pernet_subsys(&ip6_route_net_ops);
3059 out_register_inetpeer:
3060         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3061 out_dst_entries:
3062         dst_entries_destroy(&ip6_dst_blackhole_ops);
3063 out_kmem_cache:
3064         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3065         goto out;
3066 }
3067
3068 void ip6_route_cleanup(void)
3069 {
3070         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3071         unregister_pernet_subsys(&ip6_route_net_late_ops);
3072         fib6_rules_cleanup();
3073         xfrm6_fini();
3074         fib6_gc_cleanup();
3075         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3076         unregister_pernet_subsys(&ip6_route_net_ops);
3077         dst_entries_destroy(&ip6_dst_blackhole_ops);
3078         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3079 }