/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/vrf.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
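
/* For illustration (a sketch, assuming the rt_tos2priority() helper in
 * include/net/route.h): the table is indexed by the four TOS bits shifted
 * right by one.  E.g. a packet with tos 0x10 (IPTOS_LOWDELAY) maps to
 * index (0x10 & IPTOS_TOS_MASK) >> 1 == 8, i.e. TC_PRIO_INTERACTIVE,
 * while tos 0x08 (IPTOS_THROUGHPUT) maps to index 4, i.e. TC_PRIO_BULK.
 */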

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
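
/* A rough worked example of the perturbation above: if this bucket was
 * last touched 1000 jiffies ago, the CPU that wins the cmpxchg() adds a
 * random delta in [0, 1000) on top of the segs increment, so IDs sampled
 * before and after differ by more than the true packet count.  Buckets
 * in constant use (old == now) get no perturbation.
 */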

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol ^ net_hash_mix(net),
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * as stale, so that anyone caching them rechecks whether this
                 * exception applies.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them altogether,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
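
/* With the default sysctls this works out roughly as follows (a sketch,
 * not normative): redirects are spaced ip_rt_redirect_load << rate_tokens
 * apart, i.e. 20ms, 40ms, 80ms, ... at HZ=1000, until ip_rt_redirect_number
 * (9) have been ignored; after ip_rt_redirect_silence (~20s at HZ=1000)
 * without triggering traffic the token count resets and the cycle restarts.
 */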

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
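
/* The limiter above is a token bucket (a sketch of the arithmetic under
 * the default sysctls): tokens accrue one per jiffy up to a burst of
 * ip_rt_error_burst (5 * HZ), and each ICMP error costs ip_rt_error_cost
 * (HZ) tokens, so a flood of bad packets is answered with at most ~5
 * errors up front and then roughly one per second.
 */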

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (ipv4_mtu(dst) < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPv4 dsts are created with ->obsolete set to
         * DST_OBSOLETE_FORCE_CHK, which forces all validation calls down
         * into this function.
         *
         * When a PMTU/redirect update invalidates a route, this is
         * indicated by obsolete being set to DST_OBSOLETE_KILL, or to
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}
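
/* E.g. on a 1500-byte MTU device with no RTAX_ADVMSS metric set, this
 * yields 1500 - 40 = 1460 (leaving room for 20 bytes of IPv4 header and
 * 20 bytes of TCP header), clamped between ip_rt_min_advmss (256) and
 * 65535 - 40.
 */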

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        return min_t(unsigned int, mtu, IP_MAX_MTU);
}
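
/* Selection order above, as a sketch: an unexpired PMTU learned via a
 * nexthop exception wins, then any explicit RTAX_MTU metric, then the
 * device MTU; if RTAX_MTU is locked but unset, gateway routes fall back
 * to the device MTU capped at 576, and the result never exceeds
 * IP_MAX_MTU.
 */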

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}
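
/* The cmpxchg() above is a lock-free publish: if another CPU replaced
 * *p after we sampled orig, we simply report failure, and the caller
 * (rt_set_nexthop()) marks the route DST_NOCACHE instead of retrying.
 */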

struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;

                spin_lock_bh(&ul->lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&ul->lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct rtable *rt;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&ul->lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in the nexthop exception
                         * or FIB nexthop have DST_NOCACHE clear.  If we fail
                         * to cache this route here, we must set the bit.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (!in_dev)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}


static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      RFC 1812 recommendation: if the source is martian,
                 *      the only hint is the MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct fib_nh_exception *fnhe;
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        bool do_cache;
        u32 itag = 0;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
        if (!out_dev) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        do_cache = res->fi && !itag;
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            skb->protocol == htons(ETH_P_IP) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
                IPCB(skb)->flags |= IPSKB_DOREDIRECT;

        if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back out the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
	if (do_cache) {
		if (fnhe)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);

		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
	rth->rt_flags = 0;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
		rth->dst.lwtstate->orig_output = rth->dst.output;
		rth->dst.output = lwtunnel_output;
	}
	if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
		rth->dst.lwtstate->orig_input = rth->dst.input;
		rth->dst.input = lwtunnel_input;
	}
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
cleanup:
	return err;
}
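
/*
 * Sketch, compiled out: the input-side caching contract used above.
 * When do_cache is true the fresh rtable is stored in the FIB nexthop
 * (or its exception), so later packets toward the same nexthop can be
 * served by rt_cache_valid() + skb_dst_set_noref(), exactly as done at
 * the top of __mkroute_input().  The helper name is hypothetical.
 */
#if 0
static bool example_reuse_cached_input(struct sk_buff *skb, struct fib_nh *nh)
{
	struct rtable *rth = rcu_dereference(nh->nh_rth_input);

	if (!rt_cache_valid(rth))
		return false;		/* caller must build a new rtable */
	skb_dst_set_noref(skb, &rth->dst);	/* no refcount is taken */
	return true;
}
#endif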

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
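
/*
 * Sketch, compiled out: ip_mkroute_input() is the only point on the
 * input path where multipath selection happens.  With
 * CONFIG_IP_ROUTE_MULTIPATH, a fib_result carrying several nexthops is
 * narrowed to one before __mkroute_input() runs, so everything
 * downstream may use FIB_RES_NH()/FIB_RES_DEV() as if the route were
 * single-path.  The helper name is hypothetical.
 */
#if 0
static void example_narrow_multipath(struct fib_result *res)
{
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);	/* fixes res->nh_sel */
}
#endif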

/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the
 *	output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	Called with rcu_read_lock().
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct ip_tunnel_info *tun_info;
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	int		err = -EINVAL;
	struct net	*net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the weirdest martians, which cannot be detected
	 * by fib_lookup.
	 */

	tun_info = skb_tunnel_info(skb);
	if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX)
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only for the limited broadcast;
	 * I am not even sure whether this should be fixed. Waiting
	 * for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling
	 * IN_DEV_NET_ROUTE_LOCALNET() more than once: it is called at
	 * most once, and only if daddr and/or saddr is a loopback address.
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = vrf_master_ifindex_rcu(dev) ? : dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.input = ip_local_deliver;
	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_genid = rt_genid_ipv4(net);
	rth->rt_flags	= flags | RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	if (do_cache) {
		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
			rth->dst.flags |= DST_NOCACHE;
			rt_add_uncached_list(rth);
		}
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	res.fi = NULL;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
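
/*
 * Compiled-out sketch of the decision tree above, ignoring martian
 * logging and route caching: the slow path boils down to a small set
 * of outcomes keyed off the FIB result type.  A simplification for
 * readers, not a substitute for ip_route_input_slow().
 */
#if 0
static const char *example_classify(const struct fib_result *res)
{
	switch (res->type) {
	case RTN_BROADCAST:	return "deliver locally (broadcast)";
	case RTN_LOCAL:		return "deliver locally";
	case RTN_UNICAST:	return "forward via __mkroute_input()";
	default:		return "martian destination";
	}
}
#endif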

int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	int res;

	rcu_read_lock();

	/* Multicast recognition logic was moved from the route cache to
	 * here. The problem was that too many Ethernet cards have
	 * broken/missing hardware multicast filters :-( As a result, a
	 * host on a multicast network acquired a lot of useless route
	 * cache entries, e.g. for SDR messages from all over the world.
	 * Now we try to get rid of them. Really, provided the software
	 * IP multicast filter is organized reasonably (at least, hashed),
	 * it does not result in a slowdown compared with route cache
	 * reject entries.
	 * Note that multicast routers are not affected, because a route
	 * cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_noref);
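
/*
 * Usage sketch, compiled out, mirroring how the receive path consumes
 * this API: if the skb has no dst yet, route it, then hand it to the
 * dst's input hook.  The "noref" contract means the dst may be set
 * without a reference, so the skb must be consumed promptly.  The
 * helper name is hypothetical.
 */
#if 0
static int example_rcv_finish(struct sk_buff *skb, struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err = 0;

	if (!skb_dst(skb))
		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
					   iph->tos, dev);
	if (err) {
		kfree_skb(skb);
		return err;
	}
	return dst_input(skb);	/* rt->dst.input(), e.g. ip_forward */
}
#endif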

/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use the
		 * default one, but do not use the gateway in that case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (do_cache) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe)
			prth = &fnhe->fnhe_rth_output;
		else {
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nh->nh_gw &&
				       nh->nh_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth)) {
			dst_hold(&rth->dst);
			return rth;
		}
	}

add:
	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_is_input = 0;
	rth->rt_iif	= orig_oif ? : 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
	if (lwtunnel_output_redirect(rth->dst.lwtstate))
		rth->dst.output = lwtunnel_output;

	return rth;
}
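
/*
 * Compiled-out sketch of the output-side cache lookup above, under the
 * stated assumptions: an exception route (fnhe) takes precedence over
 * the per-cpu nexthop cache, and a cached entry is reused only while
 * rt_cache_valid() says its genid is still current.  The helper name
 * is hypothetical.
 */
#if 0
static struct rtable *example_cached_output(struct fib_nh *nh, __be32 daddr)
{
	struct fib_nh_exception *fnhe = find_exception(nh, daddr);
	struct rtable __rcu **prth;
	struct rtable *rth;

	prth = fnhe ? &fnhe->fnhe_rth_output
		    : raw_cpu_ptr(nh->nh_pcpu_rth_output);
	rth = rcu_dereference(*prth);
	if (rt_cache_valid(rth)) {
		dst_hold(&rth->dst);
		return rth;
	}
	return NULL;	/* caller must allocate a fresh rtable */
}
#endif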

/*
 * Major route resolver routine.
 */

struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	int orig_oif;

	res.tclassid	= 0;
	res.fi		= NULL;
	res.table	= NULL;

	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		 * It was wrong for two reasons:
		 * 1. ip_dev_find(net, saddr) can return the wrong iface
		 *    if saddr is assigned to multiple interfaces.
		 * 2. Moreover, we are allowed to send packets with the
		 *    saddr of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (!dev_out)
				goto out;

			/* Special hack: the user can direct multicasts
			 * and limited broadcasts via the necessary
			 * interface without fiddling with IP_MULTICAST_IF
			 * or IP_PKTINFO. This hack is not just for fun, it
			 * allows vic, vat and friends to work. They bind a
			 * socket to loopback, set the ttl to zero and
			 * expect that it will work. From the viewpoint of
			 * the routing cache they are broken, because we
			 * are not allowed to build a multicast path with a
			 * loopback source addr (the routing cache cannot
			 * know that the ttl is zero, so that the packet
			 * will not leave this host and the route is
			 * valid). Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}

	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (!dev_out)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr) ||
		    fl4->flowi4_proto == IPPROTO_IGMP) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
		if (netif_is_vrf(dev_out) &&
		    !(fl4->flowi4_flags & FLOWI_FLAG_VRFSRC)) {
			rth = vrf_dev_get_rth(dev_out);
			goto out;
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res, 0)) {
		res.fi = NULL;
		res.table = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, the routing tables are wrong.
			 * Assume that the destination is on-link.
			 *
			 * WHY? DW.
			 * Because we are allowed to send to an iface even
			 * if it has NO routes and NO assigned addresses.
			 * When oif is specified, the routing tables are
			 * looked up with only one purpose: to catch
			 * whether the destination is gatewayed, rather
			 * than direct. Moreover, if MSG_DONTROUTE is set,
			 * we send the packet, ignoring both the routing
			 * tables and the ifaddr state. --ANK
			 *
			 * We could do this even if oif is unknown, as
			 * IPv6 likely does, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(fl4, &res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;

make_route:
	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);

out:
	rcu_read_unlock();
	return rth;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
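
/*
 * Usage sketch, compiled out.  The resolver fills in the blanks of the
 * flow key as a side effect: on success fl4.saddr and fl4.flowi4_oif
 * hold the chosen source address and device.  The address and helper
 * name are purely illustrative.
 */
#if 0
static int example_output_lookup(struct net *net)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = htonl(0xc0000201);	/* 192.0.2.1, illustrative */
	fl4.flowi4_oif = 0;		/* let the resolver pick */

	rt = __ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return PTR_ERR(rt);
	/* fl4.saddr and fl4.flowi4_oif are now filled in */
	ip_rt_put(rt);
	return 0;
}
#endif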

static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}

static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.redirect		=	ipv4_rt_blackhole_redirect,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_sk;

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_uses_gateway = ort->rt_uses_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);
		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
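
/*
 * Sketch, compiled out: what "blackhole" means operationally.  The
 * cloned dst silently discards every packet and ignores PMTU/redirect
 * updates, which is what callers such as the IPsec lookup path rely on
 * while no usable route is available.  The helper name is hypothetical.
 */
#if 0
static void example_blackhole(struct net *net, struct dst_entry *dst)
{
	struct dst_entry *bh = ipv4_blackhole_route(net, dst);

	if (!IS_ERR(bh)) {
		/* bh->input == dst_discard, bh->output == dst_discard_sk */
		dst_release(bh);
	}
}
#endif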

struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
							flowi4_to_flowi(flp4),
							sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
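
/*
 * Usage sketch, compiled out: a protocol caller fills in the transport
 * fields and lets ip_route_output_flow() add the xfrm lookup on top of
 * __ip_route_output_key().  A non-zero flowi4_proto is what enables
 * that second step.  The helper name is hypothetical.
 */
#if 0
static struct rtable *example_udp_route(struct net *net, struct sock *sk,
					__be32 daddr, __be16 dport)
{
	struct flowi4 fl4;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.flowi4_proto = IPPROTO_UDP;	/* non-zero: xfrm lookup runs */
	fl4.fl4_dport = dport;

	return ip_route_output_flow(net, &fl4, sk);
}
#endif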

static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
			u32 seq, int event, int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway &&
	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 fl4->saddr, fl4->daddr,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
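
/*
 * Compiled-out sketch of the netlink contract rt_fill_info() follows:
 * either every attribute fits and nlmsg_end() commits the message, or
 * everything is rolled back with nlmsg_cancel() and -EMSGSIZE tells
 * the caller to retry with a bigger skb.  The helper name is
 * hypothetical.
 */
#if 0
static int example_fill(struct sk_buff *skb, u32 portid, u32 seq)
{
	struct nlmsghdr *nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE,
					 sizeof(struct rtmsg), 0);

	if (!nlh)
		return -EMSGSIZE;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
#endif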

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, dst, src, &fl4, skb,
			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err < 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
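
/*
 * Userspace sketch, compiled out: "ip route get 192.0.2.1" sends an
 * RTM_GETROUTE request like the one below, which lands in
 * inet_rtm_getroute() above.  Struct layout and helper name are
 * illustrative only; real tools use a netlink library.
 */
#if 0
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>

struct example_req {
	struct nlmsghdr nlh;
	struct rtmsg	rtm;
	struct rtattr	rta;	/* RTA_DST */
	unsigned char	dst[4];
};

static int example_getroute(int nl_fd, const unsigned char dst[4])
{
	struct example_req req;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = sizeof(req);
	req.nlh.nlmsg_type  = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family  = AF_INET;
	req.rta.rta_type    = RTA_DST;
	req.rta.rta_len	    = RTA_LENGTH(4);
	memcpy(req.dst, dst, 4);
	return send(nl_fd, &req, sizeof(req), 0) == sizeof(req) ? 0 : -1;
}
#endif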

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;

static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}

static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
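
/*
 * Userspace sketch, compiled out: writing any value to the "flush"
 * sysctl invokes ipv4_sysctl_rtcache_flush() above, bumping the genids
 * and so invalidating every cached route in the netns.  Shell
 * equivalent: echo 1 > /proc/sys/net/ipv4/route/flush.  The helper
 * name is hypothetical.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static int example_flush_route_cache(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "1\n", 2) != 2) {
		close(fd);
		return -1;
	}
	return close(fd);
}
#endif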

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

int __init ip_rt_init(void)
{
	int rc = 0;
	int cpu;

	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif