net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD;
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after a year-long coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
112 #endif
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
116
117 #define RT_FL_TOS(oldflp4) \
118         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
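
/* RT_FL_TOS keeps just what the router examines in flowi4_tos: the
 * IPTOS_RT_MASK TOS bits plus RTO_ONLINK, the low-bit flag that
 * RT_CONN_FLAGS() in <net/route.h> derives from a socket's SO_DONTROUTE
 * (SOCK_LOCALROUTE) setting to request a link-scope lookup.
 */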
119
120 #define RT_GC_TIMEOUT (300*HZ)
121
122 static int ip_rt_max_size;
123 static int ip_rt_redirect_number __read_mostly  = 9;
124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
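
/* The redirect defaults above encode a relationship, not independent values:
 * ip_rt_redirect_silence is ip_rt_redirect_load << (ip_rt_redirect_number + 1).
 * With HZ=1000 that is a 20-jiffy base load and 20480 jiffies (~20.5 s) of
 * silence before ip_rt_send_redirect() resets its backoff state.
 */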
126 static int ip_rt_error_cost __read_mostly       = HZ;
127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly       = 256;
131
132 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
133 /*
134  *      Interface to generic destination cache.
135  */
136
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void              ipv4_link_failure(struct sk_buff *skb);
142 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143                                            struct sk_buff *skb, u32 mtu);
144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145                                         struct sk_buff *skb);
146 static void             ipv4_dst_destroy(struct dst_entry *dst);
147
148 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
149 {
150         WARN_ON(1);
151         return NULL;
152 }
153
154 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
155                                            struct sk_buff *skb,
156                                            const void *daddr);
157
158 static struct dst_ops ipv4_dst_ops = {
159         .family =               AF_INET,
160         .check =                ipv4_dst_check,
161         .default_advmss =       ipv4_default_advmss,
162         .mtu =                  ipv4_mtu,
163         .cow_metrics =          ipv4_cow_metrics,
164         .destroy =              ipv4_dst_destroy,
165         .negative_advice =      ipv4_negative_advice,
166         .link_failure =         ipv4_link_failure,
167         .update_pmtu =          ip_rt_update_pmtu,
168         .redirect =             ip_do_redirect,
169         .local_out =            __ip_local_out,
170         .neigh_lookup =         ipv4_neigh_lookup,
171 };
172
173 #define ECN_OR_COST(class)      TC_PRIO_##class
174
175 const __u8 ip_tos2prio[16] = {
176         TC_PRIO_BESTEFFORT,
177         ECN_OR_COST(BESTEFFORT),
178         TC_PRIO_BESTEFFORT,
179         ECN_OR_COST(BESTEFFORT),
180         TC_PRIO_BULK,
181         ECN_OR_COST(BULK),
182         TC_PRIO_BULK,
183         ECN_OR_COST(BULK),
184         TC_PRIO_INTERACTIVE,
185         ECN_OR_COST(INTERACTIVE),
186         TC_PRIO_INTERACTIVE,
187         ECN_OR_COST(INTERACTIVE),
188         TC_PRIO_INTERACTIVE_BULK,
189         ECN_OR_COST(INTERACTIVE_BULK),
190         TC_PRIO_INTERACTIVE_BULK,
191         ECN_OR_COST(INTERACTIVE_BULK)
192 };
193 EXPORT_SYMBOL(ip_tos2prio);
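
/* Worked example of how this table is indexed: rt_tos2priority() in
 * <net/route.h> computes ip_tos2prio[IPTOS_TOS(tos) >> 1], i.e. the four
 * RFC 1349 TOS bits, with the lowest one (the "minimize monetary cost" bit,
 * later reused by ECN) selecting the ECN_OR_COST() twin.  A packet sent
 * with IPTOS_LOWDELAY (0x10) thus lands on index 8, TC_PRIO_INTERACTIVE.
 */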
194
195 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
196 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
197
198 #ifdef CONFIG_PROC_FS
199 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
200 {
201         if (*pos)
202                 return NULL;
203         return SEQ_START_TOKEN;
204 }
205
206 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
207 {
208         ++*pos;
209         return NULL;
210 }
211
212 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
213 {
214 }
215
216 static int rt_cache_seq_show(struct seq_file *seq, void *v)
217 {
218         if (v == SEQ_START_TOKEN)
219                 seq_printf(seq, "%-127s\n",
220                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
221                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
222                            "HHUptod\tSpecDst");
223         return 0;
224 }
225
226 static const struct seq_operations rt_cache_seq_ops = {
227         .start  = rt_cache_seq_start,
228         .next   = rt_cache_seq_next,
229         .stop   = rt_cache_seq_stop,
230         .show   = rt_cache_seq_show,
231 };
232
233 static int rt_cache_seq_open(struct inode *inode, struct file *file)
234 {
235         return seq_open(file, &rt_cache_seq_ops);
236 }
237
238 static const struct file_operations rt_cache_seq_fops = {
239         .owner   = THIS_MODULE,
240         .open    = rt_cache_seq_open,
241         .read    = seq_read,
242         .llseek  = seq_lseek,
243         .release = seq_release,
244 };
245
246
247 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
248 {
249         int cpu;
250
251         if (*pos == 0)
252                 return SEQ_START_TOKEN;
253
254         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
255                 if (!cpu_possible(cpu))
256                         continue;
257                 *pos = cpu+1;
258                 return &per_cpu(rt_cache_stat, cpu);
259         }
260         return NULL;
261 }
262
263 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
264 {
265         int cpu;
266
267         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
268                 if (!cpu_possible(cpu))
269                         continue;
270                 *pos = cpu+1;
271                 return &per_cpu(rt_cache_stat, cpu);
272         }
273         return NULL;
274
275 }
276
277 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
278 {
279
280 }
281
282 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
283 {
284         struct rt_cache_stat *st = v;
285
286         if (v == SEQ_START_TOKEN) {
287                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
288                 return 0;
289         }
290
291         seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
292                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
293                    dst_entries_get_slow(&ipv4_dst_ops),
294                    0, /* st->in_hit */
295                    st->in_slow_tot,
296                    st->in_slow_mc,
297                    st->in_no_route,
298                    st->in_brd,
299                    st->in_martian_dst,
300                    st->in_martian_src,
301
302                    0, /* st->out_hit */
303                    st->out_slow_tot,
304                    st->out_slow_mc,
305
306                    0, /* st->gc_total */
307                    0, /* st->gc_ignored */
308                    0, /* st->gc_goal_miss */
309                    0, /* st->gc_dst_overflow */
310                    0, /* st->in_hlist_search */
311                    0  /* st->out_hlist_search */
312                 );
313         return 0;
314 }
315
316 static const struct seq_operations rt_cpu_seq_ops = {
317         .start  = rt_cpu_seq_start,
318         .next   = rt_cpu_seq_next,
319         .stop   = rt_cpu_seq_stop,
320         .show   = rt_cpu_seq_show,
321 };
322
323
324 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
325 {
326         return seq_open(file, &rt_cpu_seq_ops);
327 }
328
329 static const struct file_operations rt_cpu_seq_fops = {
330         .owner   = THIS_MODULE,
331         .open    = rt_cpu_seq_open,
332         .read    = seq_read,
333         .llseek  = seq_lseek,
334         .release = seq_release,
335 };
336
337 #ifdef CONFIG_IP_ROUTE_CLASSID
338 static int rt_acct_proc_show(struct seq_file *m, void *v)
339 {
340         struct ip_rt_acct *dst, *src;
341         unsigned int i, j;
342
343         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
344         if (!dst)
345                 return -ENOMEM;
346
347         for_each_possible_cpu(i) {
348                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
349                 for (j = 0; j < 256; j++) {
350                         dst[j].o_bytes   += src[j].o_bytes;
351                         dst[j].o_packets += src[j].o_packets;
352                         dst[j].i_bytes   += src[j].i_bytes;
353                         dst[j].i_packets += src[j].i_packets;
354                 }
355         }
356
357         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
358         kfree(dst);
359         return 0;
360 }
361
362 static int rt_acct_proc_open(struct inode *inode, struct file *file)
363 {
364         return single_open(file, rt_acct_proc_show, NULL);
365 }
366
367 static const struct file_operations rt_acct_proc_fops = {
368         .owner          = THIS_MODULE,
369         .open           = rt_acct_proc_open,
370         .read           = seq_read,
371         .llseek         = seq_lseek,
372         .release        = single_release,
373 };
374 #endif
375
376 static int __net_init ip_rt_do_proc_init(struct net *net)
377 {
378         struct proc_dir_entry *pde;
379
380         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
381                           &rt_cache_seq_fops);
382         if (!pde)
383                 goto err1;
384
385         pde = proc_create("rt_cache", S_IRUGO,
386                           net->proc_net_stat, &rt_cpu_seq_fops);
387         if (!pde)
388                 goto err2;
389
390 #ifdef CONFIG_IP_ROUTE_CLASSID
391         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
392         if (!pde)
393                 goto err3;
394 #endif
395         return 0;
396
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398 err3:
399         remove_proc_entry("rt_cache", net->proc_net_stat);
400 #endif
401 err2:
402         remove_proc_entry("rt_cache", net->proc_net);
403 err1:
404         return -ENOMEM;
405 }
406
407 static void __net_exit ip_rt_do_proc_exit(struct net *net)
408 {
409         remove_proc_entry("rt_cache", net->proc_net_stat);
410         remove_proc_entry("rt_cache", net->proc_net);
411 #ifdef CONFIG_IP_ROUTE_CLASSID
412         remove_proc_entry("rt_acct", net->proc_net);
413 #endif
414 }
415
416 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
417         .init = ip_rt_do_proc_init,
418         .exit = ip_rt_do_proc_exit,
419 };
420
421 static int __init ip_rt_proc_init(void)
422 {
423         return register_pernet_subsys(&ip_rt_proc_ops);
424 }
425
426 #else
427 static inline int ip_rt_proc_init(void)
428 {
429         return 0;
430 }
431 #endif /* CONFIG_PROC_FS */
432
433 static inline bool rt_is_expired(const struct rtable *rth)
434 {
435         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
436 }
437
438 void rt_cache_flush(struct net *net)
439 {
440         rt_genid_bump_ipv4(net);
441 }
442
443 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
444                                            struct sk_buff *skb,
445                                            const void *daddr)
446 {
447         struct net_device *dev = dst->dev;
448         const __be32 *pkey = daddr;
449         const struct rtable *rt;
450         struct neighbour *n;
451
452         rt = (const struct rtable *) dst;
453         if (rt->rt_gateway)
454                 pkey = (const __be32 *) &rt->rt_gateway;
455         else if (skb)
456                 pkey = &ip_hdr(skb)->daddr;
457
458         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
459         if (n)
460                 return n;
461         return neigh_create(&arp_tbl, pkey, dev);
462 }
463
464 #define IP_IDENTS_SZ 2048u
465
466 static atomic_t *ip_idents __read_mostly;
467 static u32 *ip_tstamps __read_mostly;
468
469 /* In order to protect privacy, we add a perturbation to identifiers
470  * if one generator is seldom used. This makes it hard for an attacker
471  * to infer how many packets were sent between two points in time.
472  */
473 u32 ip_idents_reserve(u32 hash, int segs)
474 {
475         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
476         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
477         u32 old = ACCESS_ONCE(*p_tstamp);
478         u32 now = (u32)jiffies;
479         u32 delta = 0;
480
481         if (old != now && cmpxchg(p_tstamp, old, now) == old)
482                 delta = prandom_u32_max(now - old);
483
484         return atomic_add_return(segs + delta, p_id) - segs;
485 }
486 EXPORT_SYMBOL(ip_idents_reserve);
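
/* Worked example of the perturbation: if this bucket last handed out an ID
 * 1000 jiffies ago, the cmpxchg() winner adds a uniformly random delta in
 * [0, 1000) on top of the requested segment count, so an off-path observer
 * sampling IP IDs cannot simply subtract two readings to count the packets
 * sent in between.
 */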
487
488 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
489 {
490         static u32 ip_idents_hashrnd __read_mostly;
491         u32 hash, id;
492
493         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
494
495         hash = jhash_3words((__force u32)iph->daddr,
496                             (__force u32)iph->saddr,
497                             iph->protocol ^ net_hash_mix(net),
498                             ip_idents_hashrnd);
499         id = ip_idents_reserve(hash, segs);
500         iph->id = htons(id);
501 }
502 EXPORT_SYMBOL(__ip_select_ident);
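
/* The segs argument exists for GSO: callers such as ip_select_ident_segs()
 * in <net/ip.h> pass the number of on-wire segments (typically
 * skb_shinfo(skb)->gso_segs), reserving the whole block at once so every
 * segment carved from the skb gets its own consecutive IP ID.
 */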
503
504 static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
505                              const struct iphdr *iph,
506                              int oif, u8 tos,
507                              u8 prot, u32 mark, int flow_flags)
508 {
509         if (sk) {
510                 const struct inet_sock *inet = inet_sk(sk);
511
512                 oif = sk->sk_bound_dev_if;
513                 mark = sk->sk_mark;
514                 tos = RT_CONN_FLAGS(sk);
515                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
516         }
517         flowi4_init_output(fl4, oif, mark, tos,
518                            RT_SCOPE_UNIVERSE, prot,
519                            flow_flags,
520                            iph->daddr, iph->saddr, 0, 0,
521                            sk ? sock_i_uid(sk) : GLOBAL_ROOT_UID);
522 }
523
524 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
525                                struct sock *sk)
526 {
527         const struct iphdr *iph = ip_hdr(skb);
528         int oif = skb->dev->ifindex;
529         u8 tos = RT_TOS(iph->tos);
530         u8 prot = iph->protocol;
531         u32 mark = skb->mark;
532
533         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
534 }
535
536 static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
537 {
538         const struct inet_sock *inet = inet_sk(sk);
539         const struct ip_options_rcu *inet_opt;
540         __be32 daddr = inet->inet_daddr;
541
542         rcu_read_lock();
543         inet_opt = rcu_dereference(inet->inet_opt);
544         if (inet_opt && inet_opt->opt.srr)
545                 daddr = inet_opt->opt.faddr;
546         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
547                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
548                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
549                            inet_sk_flowi_flags(sk),
550                            daddr, inet->inet_saddr, 0, 0,
551                            sock_i_uid(sk));
552         rcu_read_unlock();
553 }
554
555 static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk,
556                                  const struct sk_buff *skb)
557 {
558         if (skb)
559                 build_skb_flow_key(fl4, skb, sk);
560         else
561                 build_sk_flow_key(fl4, sk);
562 }
563
564 static inline void rt_free(struct rtable *rt)
565 {
566         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
567 }
568
569 static DEFINE_SPINLOCK(fnhe_lock);
570
571 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
572 {
573         struct rtable *rt;
574
575         rt = rcu_dereference(fnhe->fnhe_rth_input);
576         if (rt) {
577                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
578                 rt_free(rt);
579         }
580         rt = rcu_dereference(fnhe->fnhe_rth_output);
581         if (rt) {
582                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
583                 rt_free(rt);
584         }
585 }
586
587 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
588 {
589         struct fib_nh_exception *fnhe, *oldest;
590
591         oldest = rcu_dereference(hash->chain);
592         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
593              fnhe = rcu_dereference(fnhe->fnhe_next)) {
594                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
595                         oldest = fnhe;
596         }
597         fnhe_flush_routes(oldest);
598         return oldest;
599 }
600
601 static inline u32 fnhe_hashfun(__be32 daddr)
602 {
603         static u32 fnhe_hashrnd __read_mostly;
604         u32 hval;
605
606         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
607         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
608         return hash_32(hval, FNHE_HASH_SHIFT);
609 }
610
611 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
612 {
613         rt->rt_pmtu = fnhe->fnhe_pmtu;
614         rt->dst.expires = fnhe->fnhe_expires;
615
616         if (fnhe->fnhe_gw) {
617                 rt->rt_flags |= RTCF_REDIRECTED;
618                 rt->rt_gateway = fnhe->fnhe_gw;
619                 rt->rt_uses_gateway = 1;
620         }
621 }
622
623 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
624                                   u32 pmtu, unsigned long expires)
625 {
626         struct fnhe_hash_bucket *hash;
627         struct fib_nh_exception *fnhe;
628         struct rtable *rt;
629         unsigned int i;
630         int depth;
631         u32 hval = fnhe_hashfun(daddr);
632
633         spin_lock_bh(&fnhe_lock);
634
635         hash = rcu_dereference(nh->nh_exceptions);
636         if (!hash) {
637                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
638                 if (!hash)
639                         goto out_unlock;
640                 rcu_assign_pointer(nh->nh_exceptions, hash);
641         }
642
643         hash += hval;
644
645         depth = 0;
646         for (fnhe = rcu_dereference(hash->chain); fnhe;
647              fnhe = rcu_dereference(fnhe->fnhe_next)) {
648                 if (fnhe->fnhe_daddr == daddr)
649                         break;
650                 depth++;
651         }
652
653         if (fnhe) {
654                 if (gw)
655                         fnhe->fnhe_gw = gw;
656                 if (pmtu) {
657                         fnhe->fnhe_pmtu = pmtu;
658                         fnhe->fnhe_expires = max(1UL, expires);
659                 }
660                 /* Update all cached dsts too */
661                 rt = rcu_dereference(fnhe->fnhe_rth_input);
662                 if (rt)
663                         fill_route_from_fnhe(rt, fnhe);
664                 rt = rcu_dereference(fnhe->fnhe_rth_output);
665                 if (rt)
666                         fill_route_from_fnhe(rt, fnhe);
667         } else {
668                 if (depth > FNHE_RECLAIM_DEPTH)
669                         fnhe = fnhe_oldest(hash);
670                 else {
671                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
672                         if (!fnhe)
673                                 goto out_unlock;
674
675                         fnhe->fnhe_next = hash->chain;
676                         rcu_assign_pointer(hash->chain, fnhe);
677                 }
678                 fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
679                 fnhe->fnhe_daddr = daddr;
680                 fnhe->fnhe_gw = gw;
681                 fnhe->fnhe_pmtu = pmtu;
682                 fnhe->fnhe_expires = expires;
683
684                 /* Exception created; mark the cached routes for the nexthop
685                  * stale, so anyone caching them rechecks whether this
686                  * exception applies.
687                  */
688                 rt = rcu_dereference(nh->nh_rth_input);
689                 if (rt)
690                         rt->dst.obsolete = DST_OBSOLETE_KILL;
691
692                 for_each_possible_cpu(i) {
693                         struct rtable __rcu **prt;
694                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
695                         rt = rcu_dereference(*prt);
696                         if (rt)
697                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
698                 }
699         }
700
701         fnhe->fnhe_stamp = jiffies;
702
703 out_unlock:
704         spin_unlock_bh(&fnhe_lock);
705 }
706
707 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
708                              bool kill_route)
709 {
710         __be32 new_gw = icmp_hdr(skb)->un.gateway;
711         __be32 old_gw = ip_hdr(skb)->saddr;
712         struct net_device *dev = skb->dev;
713         struct in_device *in_dev;
714         struct fib_result res;
715         struct neighbour *n;
716         struct net *net;
717
718         switch (icmp_hdr(skb)->code & 7) {
719         case ICMP_REDIR_NET:
720         case ICMP_REDIR_NETTOS:
721         case ICMP_REDIR_HOST:
722         case ICMP_REDIR_HOSTTOS:
723                 break;
724
725         default:
726                 return;
727         }
728
729         if (rt->rt_gateway != old_gw)
730                 return;
731
732         in_dev = __in_dev_get_rcu(dev);
733         if (!in_dev)
734                 return;
735
736         net = dev_net(dev);
737         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
738             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
739             ipv4_is_zeronet(new_gw))
740                 goto reject_redirect;
741
742         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
743                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
744                         goto reject_redirect;
745                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
746                         goto reject_redirect;
747         } else {
748                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
749                         goto reject_redirect;
750         }
751
752         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
753         if (!n)
754                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
755         if (!IS_ERR(n)) {
756                 if (!(n->nud_state & NUD_VALID)) {
757                         neigh_event_send(n, NULL);
758                 } else {
759                         if (fib_lookup(net, fl4, &res, 0) == 0) {
760                                 struct fib_nh *nh = &FIB_RES_NH(res);
761
762                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
763                                                 0, jiffies + ip_rt_gc_timeout);
764                         }
765                         if (kill_route)
766                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
767                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
768                 }
769                 neigh_release(n);
770         }
771         return;
772
773 reject_redirect:
774 #ifdef CONFIG_IP_ROUTE_VERBOSE
775         if (IN_DEV_LOG_MARTIANS(in_dev)) {
776                 const struct iphdr *iph = (const struct iphdr *) skb->data;
777                 __be32 daddr = iph->daddr;
778                 __be32 saddr = iph->saddr;
779
780                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
781                                      "  Advised path = %pI4 -> %pI4\n",
782                                      &old_gw, dev->name, &new_gw,
783                                      &saddr, &daddr);
784         }
785 #endif
786         ;       /* the label above needs a statement when !CONFIG_IP_ROUTE_VERBOSE */
787 }
788
789 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
790 {
791         struct rtable *rt;
792         struct flowi4 fl4;
793         const struct iphdr *iph = (const struct iphdr *) skb->data;
794         int oif = skb->dev->ifindex;
795         u8 tos = RT_TOS(iph->tos);
796         u8 prot = iph->protocol;
797         u32 mark = skb->mark;
798
799         rt = (struct rtable *) dst;
800
801         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
802         __ip_do_redirect(rt, skb, &fl4, true);
803 }
804
805 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
806 {
807         struct rtable *rt = (struct rtable *)dst;
808         struct dst_entry *ret = dst;
809
810         if (rt) {
811                 if (dst->obsolete > 0) {
812                         ip_rt_put(rt);
813                         ret = NULL;
814                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
815                            rt->dst.expires) {
816                         ip_rt_put(rt);
817                         ret = NULL;
818                 }
819         }
820         return ret;
821 }
822
823 /*
824  * Algorithm:
825  *      1. The first ip_rt_redirect_number redirects are sent
826  *         with exponential backoff, then we stop sending them at all,
827  *         assuming that the host ignores our redirects.
828  *      2. If we did not see packets requiring redirects
829  *         during ip_rt_redirect_silence, we assume that the host has
830  *         forgotten the redirected route and start sending redirects again.
831  *
832  * This algorithm is much cheaper and more intelligent than dumb load limiting
833  * in icmp.c.
834  *
835  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
836  * and "frag. need" (breaks PMTU discovery) in icmp.c.
837  */
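
/* Worked numbers, assuming HZ=1000 and the defaults above: the first
 * redirect goes out immediately (rate_tokens == 0), each later one needs a
 * gap of ip_rt_redirect_load << rate_tokens (40 ms, 80 ms, 160 ms, ...),
 * and after ip_rt_redirect_number (9) unheeded redirects we fall silent
 * until ip_rt_redirect_silence (20480 jiffies, ~20.5 s) passes without a
 * triggering packet.
 */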
838
839 void ip_rt_send_redirect(struct sk_buff *skb)
840 {
841         struct rtable *rt = skb_rtable(skb);
842         struct in_device *in_dev;
843         struct inet_peer *peer;
844         struct net *net;
845         int log_martians;
846         int vif;
847
848         rcu_read_lock();
849         in_dev = __in_dev_get_rcu(rt->dst.dev);
850         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
851                 rcu_read_unlock();
852                 return;
853         }
854         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
855         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
856         rcu_read_unlock();
857
858         net = dev_net(rt->dst.dev);
859         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
860         if (!peer) {
861                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
862                           rt_nexthop(rt, ip_hdr(skb)->daddr));
863                 return;
864         }
865
866         /* No redirected packets during ip_rt_redirect_silence;
867          * reset the algorithm.
868          */
869         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
870                 peer->rate_tokens = 0;
871
872         /* Too many ignored redirects; do not send anything and
873          * set peer->rate_last to the last seen redirected packet.
874          */
875         if (peer->rate_tokens >= ip_rt_redirect_number) {
876                 peer->rate_last = jiffies;
877                 goto out_put_peer;
878         }
879
880         /* Check for load limit; set rate_last to the latest sent
881          * redirect.
882          */
883         if (peer->rate_tokens == 0 ||
884             time_after(jiffies,
885                        (peer->rate_last +
886                         (ip_rt_redirect_load << peer->rate_tokens)))) {
887                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
888
889                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
890                 peer->rate_last = jiffies;
891                 ++peer->rate_tokens;
892 #ifdef CONFIG_IP_ROUTE_VERBOSE
893                 if (log_martians &&
894                     peer->rate_tokens == ip_rt_redirect_number)
895                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
896                                              &ip_hdr(skb)->saddr, inet_iif(skb),
897                                              &ip_hdr(skb)->daddr, &gw);
898 #endif
899         }
900 out_put_peer:
901         inet_putpeer(peer);
902 }
903
904 static int ip_error(struct sk_buff *skb)
905 {
906         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
907         struct rtable *rt = skb_rtable(skb);
908         struct inet_peer *peer;
909         unsigned long now;
910         struct net *net;
911         bool send;
912         int code;
913
914         /* IP on this device is disabled. */
915         if (!in_dev)
916                 goto out;
917
918         net = dev_net(rt->dst.dev);
919         if (!IN_DEV_FORWARD(in_dev)) {
920                 switch (rt->dst.error) {
921                 case EHOSTUNREACH:
922                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
923                         break;
924
925                 case ENETUNREACH:
926                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
927                         break;
928                 }
929                 goto out;
930         }
931
932         switch (rt->dst.error) {
933         case EINVAL:
934         default:
935                 goto out;
936         case EHOSTUNREACH:
937                 code = ICMP_HOST_UNREACH;
938                 break;
939         case ENETUNREACH:
940                 code = ICMP_NET_UNREACH;
941                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
942                 break;
943         case EACCES:
944                 code = ICMP_PKT_FILTERED;
945                 break;
946         }
947
948         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
949                                l3mdev_master_ifindex(skb->dev), 1);
950
951         send = true;
952         if (peer) {
953                 now = jiffies;
954                 peer->rate_tokens += now - peer->rate_last;
955                 if (peer->rate_tokens > ip_rt_error_burst)
956                         peer->rate_tokens = ip_rt_error_burst;
957                 peer->rate_last = now;
958                 if (peer->rate_tokens >= ip_rt_error_cost)
959                         peer->rate_tokens -= ip_rt_error_cost;
960                 else
961                         send = false;
962                 inet_putpeer(peer);
963         }
964         if (send)
965                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
966
967 out:    kfree_skb(skb);
968         return 0;
969 }
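
/* The inet_peer logic above is a token bucket: rate_tokens accrues one
 * token per elapsed jiffy, capped at ip_rt_error_burst (5 * HZ), and every
 * ICMP error sent costs ip_rt_error_cost (HZ) -- i.e. at most a burst of
 * five errors per peer, then a steady state of one per second.
 */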
970
971 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
972 {
973         struct dst_entry *dst = &rt->dst;
974         struct fib_result res;
975
976         if (dst_metric_locked(dst, RTAX_MTU))
977                 return;
978
979         if (ipv4_mtu(dst) < mtu)
980                 return;
981
982         if (mtu < ip_rt_min_pmtu)
983                 mtu = ip_rt_min_pmtu;
984
985         if (rt->rt_pmtu == mtu &&
986             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
987                 return;
988
989         rcu_read_lock();
990         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
991                 struct fib_nh *nh = &FIB_RES_NH(res);
992
993                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
994                                       jiffies + ip_rt_mtu_expires);
995         }
996         rcu_read_unlock();
997 }
998
999 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1000                               struct sk_buff *skb, u32 mtu)
1001 {
1002         struct rtable *rt = (struct rtable *) dst;
1003         struct flowi4 fl4;
1004
1005         ip_rt_build_flow_key(&fl4, sk, skb);
1006         __ip_rt_update_pmtu(rt, &fl4, mtu);
1007 }
1008
1009 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1010                       int oif, u32 mark, u8 protocol, int flow_flags)
1011 {
1012         const struct iphdr *iph = (const struct iphdr *) skb->data;
1013         struct flowi4 fl4;
1014         struct rtable *rt;
1015
1016         if (!mark)
1017                 mark = IP4_REPLY_MARK(net, skb->mark);
1018
1019         __build_flow_key(&fl4, NULL, iph, oif,
1020                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1021         rt = __ip_route_output_key(net, &fl4);
1022         if (!IS_ERR(rt)) {
1023                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1024                 ip_rt_put(rt);
1025         }
1026 }
1027 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
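
/* A minimal sketch of a caller, disabled and purely illustrative -- the
 * helper name and the assumption that skb->data points at the IP header
 * embedded in an ICMP FRAG_NEEDED error are this example's, not this
 * file's:
 */
#if 0
static void example_handle_frag_needed(struct net *net, struct sk_buff *skb)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	u32 mtu = ntohs(icmp_hdr(skb)->un.frag.mtu);

	/* Record the advertised next-hop MTU as a fib_nh_exception. */
	ipv4_update_pmtu(skb, net, mtu, 0, 0, iph->protocol, 0);
}
#endif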
1028
1029 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1030 {
1031         const struct iphdr *iph = (const struct iphdr *) skb->data;
1032         struct flowi4 fl4;
1033         struct rtable *rt;
1034
1035         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1036
1037         if (!fl4.flowi4_mark)
1038                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1039
1040         rt = __ip_route_output_key(sock_net(sk), &fl4);
1041         if (!IS_ERR(rt)) {
1042                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1043                 ip_rt_put(rt);
1044         }
1045 }
1046
1047 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1048 {
1049         const struct iphdr *iph = (const struct iphdr *) skb->data;
1050         struct flowi4 fl4;
1051         struct rtable *rt;
1052         struct dst_entry *odst = NULL;
1053         bool new = false;
1054
1055         bh_lock_sock(sk);
1056
1057         if (!ip_sk_accept_pmtu(sk))
1058                 goto out;
1059
1060         odst = sk_dst_get(sk);
1061
1062         if (sock_owned_by_user(sk) || !odst) {
1063                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1064                 goto out;
1065         }
1066
1067         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1068
1069         rt = (struct rtable *)odst;
1070         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1071                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1072                 if (IS_ERR(rt))
1073                         goto out;
1074
1075                 new = true;
1076         }
1077
1078         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1079
1080         if (!dst_check(&rt->dst, 0)) {
1081                 if (new)
1082                         dst_release(&rt->dst);
1083
1084                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1085                 if (IS_ERR(rt))
1086                         goto out;
1087
1088                 new = true;
1089         }
1090
1091         if (new)
1092                 sk_dst_set(sk, &rt->dst);
1093
1094 out:
1095         bh_unlock_sock(sk);
1096         dst_release(odst);
1097 }
1098 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1099
1100 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1101                    int oif, u32 mark, u8 protocol, int flow_flags)
1102 {
1103         const struct iphdr *iph = (const struct iphdr *) skb->data;
1104         struct flowi4 fl4;
1105         struct rtable *rt;
1106
1107         __build_flow_key(&fl4, NULL, iph, oif,
1108                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1109         rt = __ip_route_output_key(net, &fl4);
1110         if (!IS_ERR(rt)) {
1111                 __ip_do_redirect(rt, skb, &fl4, false);
1112                 ip_rt_put(rt);
1113         }
1114 }
1115 EXPORT_SYMBOL_GPL(ipv4_redirect);
1116
1117 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1118 {
1119         const struct iphdr *iph = (const struct iphdr *) skb->data;
1120         struct flowi4 fl4;
1121         struct rtable *rt;
1122
1123         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1124         rt = __ip_route_output_key(sock_net(sk), &fl4);
1125         if (!IS_ERR(rt)) {
1126                 __ip_do_redirect(rt, skb, &fl4, false);
1127                 ip_rt_put(rt);
1128         }
1129 }
1130 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1131
1132 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1133 {
1134         struct rtable *rt = (struct rtable *) dst;
1135
1136         /* All IPV4 dsts are created with ->obsolete set to the value
1137          * DST_OBSOLETE_FORCE_CHK, which forces all validation calls
1138          * down into this function.
1139          *
1140          * When a PMTU/redirect information update invalidates a route,
1141          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1142          * DST_OBSOLETE_DEAD by dst_free().
1143          */
1144         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1145                 return NULL;
1146         return dst;
1147 }
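
/* Caller-side sketch of the contract above (hypothetical helper; error
 * handling of ip_route_output_flow() elided).  A cached dst must be
 * revalidated before every use; a NULL return from the ->check() path
 * means "drop it and re-route":
 */
#if 0
static struct rtable *example_revalidate(struct sock *sk, struct flowi4 *fl4)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (!dst)	/* obsolete or expired by PMTU/redirect/genid bump */
		return ip_route_output_flow(sock_net(sk), fl4, sk);
	return (struct rtable *)dst;
}
#endif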
1148
1149 static void ipv4_link_failure(struct sk_buff *skb)
1150 {
1151         struct rtable *rt;
1152
1153         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1154
1155         rt = skb_rtable(skb);
1156         if (rt)
1157                 dst_set_expires(&rt->dst, 0);
1158 }
1159
1160 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1161 {
1162         pr_debug("%s: %pI4 -> %pI4, %s\n",
1163                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1164                  skb->dev ? skb->dev->name : "?");
1165         kfree_skb(skb);
1166         WARN_ON(1);
1167         return 0;
1168 }
1169
1170 /*
1171    We do not cache the source address of the outgoing interface,
1172    because it is used only by the IP RR, TS and SRR options,
1173    so it is out of the fast path.
1174
1175    BTW remember: "addr" is allowed to be unaligned
1176    in IP options!
1177  */
1178
1179 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1180 {
1181         __be32 src;
1182
1183         if (rt_is_output_route(rt))
1184                 src = ip_hdr(skb)->saddr;
1185         else {
1186                 struct fib_result res;
1187                 struct flowi4 fl4;
1188                 struct iphdr *iph;
1189
1190                 iph = ip_hdr(skb);
1191
1192                 memset(&fl4, 0, sizeof(fl4));
1193                 fl4.daddr = iph->daddr;
1194                 fl4.saddr = iph->saddr;
1195                 fl4.flowi4_tos = RT_TOS(iph->tos);
1196                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1197                 fl4.flowi4_iif = skb->dev->ifindex;
1198                 fl4.flowi4_mark = skb->mark;
1199
1200                 rcu_read_lock();
1201                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1202                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1203                 else
1204                         src = inet_select_addr(rt->dst.dev,
1205                                                rt_nexthop(rt, iph->daddr),
1206                                                RT_SCOPE_UNIVERSE);
1207                 rcu_read_unlock();
1208         }
1209         memcpy(addr, &src, 4);
1210 }
1211
1212 #ifdef CONFIG_IP_ROUTE_CLASSID
1213 static void set_class_tag(struct rtable *rt, u32 tag)
1214 {
1215         if (!(rt->dst.tclassid & 0xFFFF))
1216                 rt->dst.tclassid |= tag & 0xFFFF;
1217         if (!(rt->dst.tclassid & 0xFFFF0000))
1218                 rt->dst.tclassid |= tag & 0xFFFF0000;
1219 }
1220 #endif
1221
1222 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1223 {
1224         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1225
1226         if (advmss == 0) {
1227                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1228                                ip_rt_min_advmss);
1229                 if (advmss > 65535 - 40)
1230                         advmss = 65535 - 40;
1231         }
1232         return advmss;
1233 }
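
/* Worked example: a 1500-byte Ethernet MTU with no RTAX_ADVMSS metric
 * yields an advertised MSS of 1500 - 40 = 1460 (room for 20 bytes each of
 * IPv4 and TCP header), clamped to [ip_rt_min_advmss, 65535 - 40].
 */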
1234
1235 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1236 {
1237         const struct rtable *rt = (const struct rtable *) dst;
1238         unsigned int mtu = rt->rt_pmtu;
1239
1240         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1241                 mtu = dst_metric_raw(dst, RTAX_MTU);
1242
1243         if (mtu)
1244                 return mtu;
1245
1246         mtu = dst->dev->mtu;
1247
1248         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1249                 if (rt->rt_uses_gateway && mtu > 576)
1250                         mtu = 576;
1251         }
1252
1253         return min_t(unsigned int, mtu, IP_MAX_MTU);
1254 }
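
/* Note the special case above: with the MTU metric administratively locked
 * and a gateway in the path, the effective MTU is conservatively capped at
 * 576 bytes, the minimum datagram size every IPv4 host must accept
 * (RFC 791).
 */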
1255
1256 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1257 {
1258         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1259         struct fib_nh_exception *fnhe;
1260         u32 hval;
1261
1262         if (!hash)
1263                 return NULL;
1264
1265         hval = fnhe_hashfun(daddr);
1266
1267         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1268              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1269                 if (fnhe->fnhe_daddr == daddr)
1270                         return fnhe;
1271         }
1272         return NULL;
1273 }
1274
1275 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1276                               __be32 daddr)
1277 {
1278         bool ret = false;
1279
1280         spin_lock_bh(&fnhe_lock);
1281
1282         if (daddr == fnhe->fnhe_daddr) {
1283                 struct rtable __rcu **porig;
1284                 struct rtable *orig;
1285                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1286
1287                 if (rt_is_input_route(rt))
1288                         porig = &fnhe->fnhe_rth_input;
1289                 else
1290                         porig = &fnhe->fnhe_rth_output;
1291                 orig = rcu_dereference(*porig);
1292
1293                 if (fnhe->fnhe_genid != genid) {
1294                         fnhe->fnhe_genid = genid;
1295                         fnhe->fnhe_gw = 0;
1296                         fnhe->fnhe_pmtu = 0;
1297                         fnhe->fnhe_expires = 0;
1298                         fnhe_flush_routes(fnhe);
1299                         orig = NULL;
1300                 }
1301                 fill_route_from_fnhe(rt, fnhe);
1302                 if (!rt->rt_gateway)
1303                         rt->rt_gateway = daddr;
1304
1305                 if (!(rt->dst.flags & DST_NOCACHE)) {
1306                         rcu_assign_pointer(*porig, rt);
1307                         if (orig)
1308                                 rt_free(orig);
1309                         ret = true;
1310                 }
1311
1312                 fnhe->fnhe_stamp = jiffies;
1313         }
1314         spin_unlock_bh(&fnhe_lock);
1315
1316         return ret;
1317 }
1318
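/* Publish @rt as the cached route for @nh, lock-free: racing CPUs all try
 * the cmpxchg() below, exactly one wins (and frees the entry it displaced),
 * and losers get false back so that rt_set_nexthop() falls back to marking
 * the route DST_NOCACHE and parking it on the uncached list.
 */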
1319 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1320 {
1321         struct rtable *orig, *prev, **p;
1322         bool ret = true;
1323
1324         if (rt_is_input_route(rt)) {
1325                 p = (struct rtable **)&nh->nh_rth_input;
1326         } else {
1327                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1328         }
1329         orig = *p;
1330
1331         prev = cmpxchg(p, orig, rt);
1332         if (prev == orig) {
1333                 if (orig)
1334                         rt_free(orig);
1335         } else
1336                 ret = false;
1337
1338         return ret;
1339 }
1340
1341 struct uncached_list {
1342         spinlock_t              lock;
1343         struct list_head        head;
1344 };
1345
1346 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1347
1348 static void rt_add_uncached_list(struct rtable *rt)
1349 {
1350         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1351
1352         rt->rt_uncached_list = ul;
1353
1354         spin_lock_bh(&ul->lock);
1355         list_add_tail(&rt->rt_uncached, &ul->head);
1356         spin_unlock_bh(&ul->lock);
1357 }
1358
1359 static void ipv4_dst_destroy(struct dst_entry *dst)
1360 {
1361         struct rtable *rt = (struct rtable *) dst;
1362
1363         if (!list_empty(&rt->rt_uncached)) {
1364                 struct uncached_list *ul = rt->rt_uncached_list;
1365
1366                 spin_lock_bh(&ul->lock);
1367                 list_del(&rt->rt_uncached);
1368                 spin_unlock_bh(&ul->lock);
1369         }
1370 }
1371
1372 void rt_flush_dev(struct net_device *dev)
1373 {
1374         struct net *net = dev_net(dev);
1375         struct rtable *rt;
1376         int cpu;
1377
1378         for_each_possible_cpu(cpu) {
1379                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1380
1381                 spin_lock_bh(&ul->lock);
1382                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1383                         if (rt->dst.dev != dev)
1384                                 continue;
1385                         rt->dst.dev = net->loopback_dev;
1386                         dev_hold(rt->dst.dev);
1387                         dev_put(dev);
1388                 }
1389                 spin_unlock_bh(&ul->lock);
1390         }
1391 }
1392
1393 static bool rt_cache_valid(const struct rtable *rt)
1394 {
1395         return  rt &&
1396                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1397                 !rt_is_expired(rt);
1398 }
1399
1400 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1401                            const struct fib_result *res,
1402                            struct fib_nh_exception *fnhe,
1403                            struct fib_info *fi, u16 type, u32 itag)
1404 {
1405         bool cached = false;
1406
1407         if (fi) {
1408                 struct fib_nh *nh = &FIB_RES_NH(*res);
1409
1410                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1411                         rt->rt_gateway = nh->nh_gw;
1412                         rt->rt_uses_gateway = 1;
1413                 }
1414                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1415 #ifdef CONFIG_IP_ROUTE_CLASSID
1416                 rt->dst.tclassid = nh->nh_tclassid;
1417 #endif
1418                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1419                 if (unlikely(fnhe))
1420                         cached = rt_bind_exception(rt, fnhe, daddr);
1421                 else if (!(rt->dst.flags & DST_NOCACHE))
1422                         cached = rt_cache_route(nh, rt);
1423                 if (unlikely(!cached)) {
1424                         /* Routes we intend to cache in a nexthop exception
1425                          * or FIB nexthop have the DST_NOCACHE bit clear.
1426                          * However, if we are unsuccessful at storing this
1427                          * route into the cache, we really need to set it.
1428                          */
1429                         rt->dst.flags |= DST_NOCACHE;
1430                         if (!rt->rt_gateway)
1431                                 rt->rt_gateway = daddr;
1432                         rt_add_uncached_list(rt);
1433                 }
1434         } else
1435                 rt_add_uncached_list(rt);
1436
1437 #ifdef CONFIG_IP_ROUTE_CLASSID
1438 #ifdef CONFIG_IP_MULTIPLE_TABLES
1439         set_class_tag(rt, res->tclassid);
1440 #endif
1441         set_class_tag(rt, itag);
1442 #endif
1443 }
1444
1445 static struct rtable *rt_dst_alloc(struct net_device *dev,
1446                                    unsigned int flags, u16 type,
1447                                    bool nopolicy, bool noxfrm, bool will_cache)
1448 {
1449         struct rtable *rt;
1450
1451         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1452                        (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1453                        (nopolicy ? DST_NOPOLICY : 0) |
1454                        (noxfrm ? DST_NOXFRM : 0));
1455
1456         if (rt) {
1457                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1458                 rt->rt_flags = flags;
1459                 rt->rt_type = type;
1460                 rt->rt_is_input = 0;
1461                 rt->rt_iif = 0;
1462                 rt->rt_pmtu = 0;
1463                 rt->rt_gateway = 0;
1464                 rt->rt_uses_gateway = 0;
1465                 rt->rt_table_id = 0;
1466                 INIT_LIST_HEAD(&rt->rt_uncached);
1467
1468                 rt->dst.output = ip_output;
1469                 if (flags & RTCF_LOCAL)
1470                         rt->dst.input = ip_local_deliver;
1471         }
1472
1473         return rt;
1474 }
1475
1476 /* called in rcu_read_lock() section */
1477 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1478                                 u8 tos, struct net_device *dev, int our)
1479 {
1480         struct rtable *rth;
1481         struct in_device *in_dev = __in_dev_get_rcu(dev);
1482         unsigned int flags = RTCF_MULTICAST;
1483         u32 itag = 0;
1484         int err;
1485
1486         /* Primary sanity checks. */
1487
1488         if (!in_dev)
1489                 return -EINVAL;
1490
1491         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1492             skb->protocol != htons(ETH_P_IP))
1493                 goto e_inval;
1494
1495         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1496                 goto e_inval;
1497
1498         if (ipv4_is_zeronet(saddr)) {
1499                 if (!ipv4_is_local_multicast(daddr))
1500                         goto e_inval;
1501         } else {
1502                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1503                                           in_dev, &itag);
1504                 if (err < 0)
1505                         goto e_err;
1506         }
1507         if (our)
1508                 flags |= RTCF_LOCAL;
1509
1510         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1511                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1512         if (!rth)
1513                 goto e_nobufs;
1514
1515 #ifdef CONFIG_IP_ROUTE_CLASSID
1516         rth->dst.tclassid = itag;
1517 #endif
1518         rth->dst.output = ip_rt_bug;
1519                 rth->rt_is_input = 1;
1520
1521 #ifdef CONFIG_IP_MROUTE
1522         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1523                 rth->dst.input = ip_mr_input;
1524 #endif
1525         RT_CACHE_STAT_INC(in_slow_mc);
1526
1527         skb_dst_set(skb, &rth->dst);
1528         return 0;
1529
1530 e_nobufs:
1531         return -ENOBUFS;
1532 e_inval:
1533         return -EINVAL;
1534 e_err:
1535         return err;
1536 }
1537
1538
1539 static void ip_handle_martian_source(struct net_device *dev,
1540                                      struct in_device *in_dev,
1541                                      struct sk_buff *skb,
1542                                      __be32 daddr,
1543                                      __be32 saddr)
1544 {
1545         RT_CACHE_STAT_INC(in_martian_src);
1546 #ifdef CONFIG_IP_ROUTE_VERBOSE
1547         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1548                 /*
1549                  *      RFC 1812 recommendation: if the source is martian,
1550                  *      the only hint we can log is the MAC header.
1551                  */
1552                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1553                         &daddr, &saddr, dev->name);
1554                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1555                         print_hex_dump(KERN_WARNING, "ll header: ",
1556                                        DUMP_PREFIX_OFFSET, 16, 1,
1557                                        skb_mac_header(skb),
1558                                        dev->hard_header_len, true);
1559                 }
1560         }
1561 #endif
1562 }
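/* Usage note (added commentary, not in the original source): the warning
 * above only fires when martian logging is enabled on the receiving
 * interface, which can typically be done at runtime via sysctl, e.g.:
 *
 *     sysctl -w net.ipv4.conf.all.log_martians=1
 */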
1563
1564 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1565 {
1566         struct fnhe_hash_bucket *hash;
1567         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1568         u32 hval = fnhe_hashfun(daddr);
1569
1570         spin_lock_bh(&fnhe_lock);
1571
1572         hash = rcu_dereference_protected(nh->nh_exceptions,
1573                                          lockdep_is_held(&fnhe_lock));
1574         hash += hval;
1575
1576         fnhe_p = &hash->chain;
1577         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1578         while (fnhe) {
1579                 if (fnhe->fnhe_daddr == daddr) {
1580                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1581                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1582                         fnhe_flush_routes(fnhe);
1583                         kfree_rcu(fnhe, rcu);
1584                         break;
1585                 }
1586                 fnhe_p = &fnhe->fnhe_next;
1587                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1588                                                  lockdep_is_held(&fnhe_lock));
1589         }
1590
1591         spin_unlock_bh(&fnhe_lock);
1592 }
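/* Illustration (added commentary, not kernel code): ip_del_fnhe() above
 * walks the exception chain through a pointer to the previous link so the
 * matching node can be unlinked without a special case for the bucket
 * head. A minimal userspace sketch of the same idiom, with the RCU and
 * locking details stripped out:
 *
 *     #include <stdlib.h>
 *
 *     struct node { int key; struct node *next; };
 *
 *     static void del_key(struct node **head, int key)
 *     {
 *             struct node **pp = head;
 *             struct node *n;
 *
 *             while ((n = *pp) != NULL) {
 *                     if (n->key == key) {
 *                             *pp = n->next;  // unlink, head or middle alike
 *                             free(n);
 *                             return;
 *                     }
 *                     pp = &n->next;
 *             }
 *     }
 */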
1593
1594 /* called in rcu_read_lock() section */
1595 static int __mkroute_input(struct sk_buff *skb,
1596                            const struct fib_result *res,
1597                            struct in_device *in_dev,
1598                            __be32 daddr, __be32 saddr, u32 tos)
1599 {
1600         struct fib_nh_exception *fnhe;
1601         struct rtable *rth;
1602         int err;
1603         struct in_device *out_dev;
1604         bool do_cache;
1605         u32 itag = 0;
1606
1607         /* get a working reference to the output device */
1608         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1609         if (!out_dev) {
1610                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1611                 return -EINVAL;
1612         }
1613
1614         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1615                                   in_dev->dev, in_dev, &itag);
1616         if (err < 0) {
1617                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1618                                          saddr);
1619
1620                 goto cleanup;
1621         }
1622
1623         do_cache = res->fi && !itag;
1624         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1625             skb->protocol == htons(ETH_P_IP) &&
1626             (IN_DEV_SHARED_MEDIA(out_dev) ||
1627              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1628                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1629
1630         if (skb->protocol != htons(ETH_P_IP)) {
1631                 /* Not IP (i.e. ARP). Do not create a route if it is
1632                  * invalid for proxy arp. DNAT routes are always valid.
1633                  *
1634                  * The proxy arp feature has been extended to allow ARP
1635                  * replies back out the same interface, to support
1636                  * private VLAN switch technologies. See arp.c.
1637                  */
1638                 if (out_dev == in_dev &&
1639                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1640                         err = -EINVAL;
1641                         goto cleanup;
1642                 }
1643         }
1644
1645         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1646         if (do_cache) {
1647                 if (fnhe) {
1648                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1649                         if (rth && rth->dst.expires &&
1650                             time_after(jiffies, rth->dst.expires)) {
1651                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1652                                 fnhe = NULL;
1653                         } else {
1654                                 goto rt_cache;
1655                         }
1656                 }
1657
1658                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1659
1660 rt_cache:
1661                 if (rt_cache_valid(rth)) {
1662                         skb_dst_set_noref(skb, &rth->dst);
1663                         goto out;
1664                 }
1665         }
1666
1667         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1668                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1669                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1670         if (!rth) {
1671                 err = -ENOBUFS;
1672                 goto cleanup;
1673         }
1674
1675         rth->rt_is_input = 1;
1676         if (res->table)
1677                 rth->rt_table_id = res->table->tb_id;
1678         RT_CACHE_STAT_INC(in_slow_tot);
1679
1680         rth->dst.input = ip_forward;
1681
1682         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1683         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1684                 rth->dst.lwtstate->orig_output = rth->dst.output;
1685                 rth->dst.output = lwtunnel_output;
1686         }
1687         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1688                 rth->dst.lwtstate->orig_input = rth->dst.input;
1689                 rth->dst.input = lwtunnel_input;
1690         }
1691         skb_dst_set(skb, &rth->dst);
1692 out:
1693         err = 0;
1694 cleanup:
1695         return err;
1696 }
1697
1698 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1699
1700 /* To make ICMP errors follow the same path as the flow they refer to,
1701  * the multipath hash is calculated from the inner IP addresses, in reverse order.
1702  */
1703 static int ip_multipath_icmp_hash(struct sk_buff *skb)
1704 {
1705         const struct iphdr *outer_iph = ip_hdr(skb);
1706         struct icmphdr _icmph;
1707         const struct icmphdr *icmph;
1708         struct iphdr _inner_iph;
1709         const struct iphdr *inner_iph;
1710
1711         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1712                 goto standard_hash;
1713
1714         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1715                                    &_icmph);
1716         if (!icmph)
1717                 goto standard_hash;
1718
1719         if (icmph->type != ICMP_DEST_UNREACH &&
1720             icmph->type != ICMP_REDIRECT &&
1721             icmph->type != ICMP_TIME_EXCEEDED &&
1722             icmph->type != ICMP_PARAMETERPROB) {
1723                 goto standard_hash;
1724         }
1725
1726         inner_iph = skb_header_pointer(skb,
1727                                        outer_iph->ihl * 4 + sizeof(_icmph),
1728                                        sizeof(_inner_iph), &_inner_iph);
1729         if (!inner_iph)
1730                 goto standard_hash;
1731
1732         return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1733
1734 standard_hash:
1735         return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1736 }
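/* Illustration (added commentary, not kernel code): an ICMP error about a
 * flow A -> B travels back toward A and quotes the original A -> B header.
 * Hashing that inner header in reverse order yields the same value as the
 * standard hash of an ordinary B -> A packet, so the error is steered onto
 * the same multipath nexthop as the reverse-direction traffic of the flow.
 * A toy userspace demonstration, with a made-up stand-in for
 * fib_multipath_hash():
 *
 *     #include <assert.h>
 *     #include <stdint.h>
 *
 *     static uint32_t toy_hash(uint32_t src, uint32_t dst)
 *     {
 *             return src * 2654435761u ^ dst;
 *     }
 *
 *     int main(void)
 *     {
 *             uint32_t A = 0x0a000001, B = 0x0a000002;
 *             uint32_t inner_saddr = A, inner_daddr = B;  // quoted header
 *
 *             // an ordinary reverse-direction packet B -> A ...
 *             uint32_t reply_hash = toy_hash(B, A);
 *             // ... and the ICMP error, hashed from the inner header
 *             // in reverse order
 *             uint32_t icmp_hash = toy_hash(inner_daddr, inner_saddr);
 *
 *             assert(reply_hash == icmp_hash);
 *             return 0;
 *     }
 */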
1737
1738 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1739
1740 static int ip_mkroute_input(struct sk_buff *skb,
1741                             struct fib_result *res,
1742                             const struct flowi4 *fl4,
1743                             struct in_device *in_dev,
1744                             __be32 daddr, __be32 saddr, u32 tos)
1745 {
1746 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1747         if (res->fi && res->fi->fib_nhs > 1) {
1748                 int h;
1749
1750                 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1751                         h = ip_multipath_icmp_hash(skb);
1752                 else
1753                         h = fib_multipath_hash(saddr, daddr);
1754                 fib_select_multipath(res, h);
1755         }
1756 #endif
1757
1758         /* create a routing cache entry */
1759         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1760 }
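/* Illustration (added commentary, not kernel code): fib_select_multipath()
 * consumes the hash computed above to pick a nexthop. Conceptually the
 * hash space is divided into per-nexthop ranges so that nexthop weights
 * translate into a proportional share of flows. A toy sketch of that idea;
 * the real kernel derives the bounds from the configured weights, and the
 * numbers here are made up:
 *
 *     static const unsigned int toy_upper_bound[2] = {
 *             0x55555555u,    // nexthop 0 owns ~1/3 of the hash space
 *             0xffffffffu,    // nexthop 1 owns the rest
 *     };
 *
 *     static int toy_select_nexthop(unsigned int h)
 *     {
 *             int i;
 *
 *             for (i = 0; i < 2; i++)
 *                     if (h <= toy_upper_bound[i])
 *                             return i;
 *             return -1;      // unreachable with the table above
 *     }
 */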
1761
1762 /*
1763  *      NOTE. We drop all packets that have a local source
1764  *      address, because every properly looped-back packet
1765  *      must already have the correct destination attached by the output routine.
1766  *
1767  *      This approach solves two big problems:
1768  *      1. Non-simplex devices are handled properly.
1769  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1770  *      called with rcu_read_lock()
1771  */
1772
1773 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1774                                u8 tos, struct net_device *dev)
1775 {
1776         struct fib_result res;
1777         struct in_device *in_dev = __in_dev_get_rcu(dev);
1778         struct ip_tunnel_info *tun_info;
1779         struct flowi4   fl4;
1780         unsigned int    flags = 0;
1781         u32             itag = 0;
1782         struct rtable   *rth;
1783         int             err = -EINVAL;
1784         struct net    *net = dev_net(dev);
1785         bool do_cache;
1786
1787         /* IP on this device is disabled. */
1788
1789         if (!in_dev)
1790                 goto out;
1791
1792         /* Check for the weirdest martians, which cannot be detected
1793            by fib_lookup.
1794          */
1795
1796         tun_info = skb_tunnel_info(skb);
1797         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1798                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1799         else
1800                 fl4.flowi4_tun_key.tun_id = 0;
1801         skb_dst_drop(skb);
1802
1803         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1804                 goto martian_source;
1805
1806         res.fi = NULL;
1807         res.table = NULL;
1808         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1809                 goto brd_input;
1810
1811         /* Accept zero addresses only for limited broadcast;
1812          * I do not even know whether to fix this or not. Waiting for complaints :-)
1813          */
1814         if (ipv4_is_zeronet(saddr))
1815                 goto martian_source;
1816
1817         if (ipv4_is_zeronet(daddr))
1818                 goto martian_destination;
1819
1820         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1821          * more than once when daddr and/or saddr is a loopback address.
1822          */
1823         if (ipv4_is_loopback(daddr)) {
1824                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1825                         goto martian_destination;
1826         } else if (ipv4_is_loopback(saddr)) {
1827                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1828                         goto martian_source;
1829         }
1830
1831         /*
1832          *      Now we are ready to route the packet.
1833          */
1834         fl4.flowi4_oif = 0;
1835         fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
1836         fl4.flowi4_mark = skb->mark;
1837         fl4.flowi4_tos = tos;
1838         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1839         fl4.flowi4_flags = 0;
1840         fl4.daddr = daddr;
1841         fl4.saddr = saddr;
1842         err = fib_lookup(net, &fl4, &res, 0);
1843         if (err != 0) {
1844                 if (!IN_DEV_FORWARD(in_dev))
1845                         err = -EHOSTUNREACH;
1846                 goto no_route;
1847         }
1848
1849         if (res.type == RTN_BROADCAST)
1850                 goto brd_input;
1851
1852         if (res.type == RTN_LOCAL) {
1853                 err = fib_validate_source(skb, saddr, daddr, tos,
1854                                           0, dev, in_dev, &itag);
1855                 if (err < 0)
1856                         goto martian_source;
1857                 goto local_input;
1858         }
1859
1860         if (!IN_DEV_FORWARD(in_dev)) {
1861                 err = -EHOSTUNREACH;
1862                 goto no_route;
1863         }
1864         if (res.type != RTN_UNICAST)
1865                 goto martian_destination;
1866
1867         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1868 out:    return err;
1869
1870 brd_input:
1871         if (skb->protocol != htons(ETH_P_IP))
1872                 goto e_inval;
1873
1874         if (!ipv4_is_zeronet(saddr)) {
1875                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1876                                           in_dev, &itag);
1877                 if (err < 0)
1878                         goto martian_source;
1879         }
1880         flags |= RTCF_BROADCAST;
1881         res.type = RTN_BROADCAST;
1882         RT_CACHE_STAT_INC(in_brd);
1883
1884 local_input:
1885         do_cache = false;
1886         if (res.fi) {
1887                 if (!itag) {
1888                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1889                         if (rt_cache_valid(rth)) {
1890                                 skb_dst_set_noref(skb, &rth->dst);
1891                                 err = 0;
1892                                 goto out;
1893                         }
1894                         do_cache = true;
1895                 }
1896         }
1897
1898         rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
1899                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1900         if (!rth)
1901                 goto e_nobufs;
1902
1903         rth->dst.output = ip_rt_bug;
1904 #ifdef CONFIG_IP_ROUTE_CLASSID
1905         rth->dst.tclassid = itag;
1906 #endif
1907         rth->rt_is_input = 1;
1908         if (res.table)
1909                 rth->rt_table_id = res.table->tb_id;
1910
1911         RT_CACHE_STAT_INC(in_slow_tot);
1912         if (res.type == RTN_UNREACHABLE) {
1913                 rth->dst.input = ip_error;
1914                 rth->dst.error = -err;
1915                 rth->rt_flags   &= ~RTCF_LOCAL;
1916         }
1917         if (do_cache) {
1918                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1919                         rth->dst.flags |= DST_NOCACHE;
1920                         rt_add_uncached_list(rth);
1921                 }
1922         }
1923         skb_dst_set(skb, &rth->dst);
1924         err = 0;
1925         goto out;
1926
1927 no_route:
1928         RT_CACHE_STAT_INC(in_no_route);
1929         res.type = RTN_UNREACHABLE;
1930         res.fi = NULL;
1931         res.table = NULL;
1932         goto local_input;
1933
1934         /*
1935          *      Do not cache martian addresses: they should be logged (RFC1812)
1936          */
1937 martian_destination:
1938         RT_CACHE_STAT_INC(in_martian_dst);
1939 #ifdef CONFIG_IP_ROUTE_VERBOSE
1940         if (IN_DEV_LOG_MARTIANS(in_dev))
1941                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1942                                      &daddr, &saddr, dev->name);
1943 #endif
1944
1945 e_inval:
1946         err = -EINVAL;
1947         goto out;
1948
1949 e_nobufs:
1950         err = -ENOBUFS;
1951         goto out;
1952
1953 martian_source:
1954         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1955         goto out;
1956 }
1957
1958 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1959                          u8 tos, struct net_device *dev)
1960 {
1961         int res;
1962
1963         rcu_read_lock();
1964
1965         /* Multicast recognition logic has moved from the route cache to here.
1966            The problem was that too many Ethernet cards have broken/missing
1967            hardware multicast filters :-( As a result, a host on a multicast
1968            network acquires a lot of useless route cache entries, e.g. from
1969            SDR messages from all over the world. Now we try to get rid of them.
1970            Really, provided the software IP multicast filter is organized
1971            reasonably (at least, hashed), it does not result in a slowdown
1972            compared with route cache reject entries.
1973            Note that multicast routers are not affected, because a
1974            route cache entry is created eventually.
1975          */
1976         if (ipv4_is_multicast(daddr)) {
1977                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1978
1979                 if (in_dev) {
1980                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1981                                                   ip_hdr(skb)->protocol);
1982                         if (our
1983 #ifdef CONFIG_IP_MROUTE
1984                                 ||
1985                             (!ipv4_is_local_multicast(daddr) &&
1986                              IN_DEV_MFORWARD(in_dev))
1987 #endif
1988                            ) {
1989                                 int res = ip_route_input_mc(skb, daddr, saddr,
1990                                                             tos, dev, our);
1991                                 rcu_read_unlock();
1992                                 return res;
1993                         }
1994                 }
1995                 rcu_read_unlock();
1996                 return -EINVAL;
1997         }
1998         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1999         rcu_read_unlock();
2000         return res;
2001 }
2002 EXPORT_SYMBOL(ip_route_input_noref);
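/* Usage sketch (added commentary, not in the original source): this is the
 * entry point the IPv4 receive path uses to route an incoming skb. A
 * caller in the style of ip_rcv_finish() looks roughly like:
 *
 *     const struct iphdr *iph = ip_hdr(skb);
 *     int err;
 *
 *     err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *                                iph->tos, skb->dev);
 *     if (err)
 *             goto drop;      // martian source, no route, ...
 *     // on success skb_dst(skb) is set and dst_input(skb) delivers it
 */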
2003
2004 /* called with rcu_read_lock() */
2005 static struct rtable *__mkroute_output(const struct fib_result *res,
2006                                        const struct flowi4 *fl4, int orig_oif,
2007                                        struct net_device *dev_out,
2008                                        unsigned int flags)
2009 {
2010         struct fib_info *fi = res->fi;
2011         struct fib_nh_exception *fnhe;
2012         struct in_device *in_dev;
2013         u16 type = res->type;
2014         struct rtable *rth;
2015         bool do_cache;
2016
2017         in_dev = __in_dev_get_rcu(dev_out);
2018         if (!in_dev)
2019                 return ERR_PTR(-EINVAL);
2020
2021         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2022                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2023                         return ERR_PTR(-EINVAL);
2024
2025         if (ipv4_is_lbcast(fl4->daddr))
2026                 type = RTN_BROADCAST;
2027         else if (ipv4_is_multicast(fl4->daddr))
2028                 type = RTN_MULTICAST;
2029         else if (ipv4_is_zeronet(fl4->daddr))
2030                 return ERR_PTR(-EINVAL);
2031
2032         if (dev_out->flags & IFF_LOOPBACK)
2033                 flags |= RTCF_LOCAL;
2034
2035         do_cache = true;
2036         if (type == RTN_BROADCAST) {
2037                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2038                 fi = NULL;
2039         } else if (type == RTN_MULTICAST) {
2040                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2041                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2042                                      fl4->flowi4_proto))
2043                         flags &= ~RTCF_LOCAL;
2044                 else
2045                         do_cache = false;
2046                 /* If a multicast route does not exist, use
2047                  * the default one, but do not gateway in this case.
2048                  * Yes, it is a hack.
2049                  */
2050                 if (fi && res->prefixlen < 4)
2051                         fi = NULL;
2052         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2053                    (orig_oif != dev_out->ifindex)) {
2054                 /* For local routes that require a particular output interface
2055                  * we do not want to cache the result.  Caching the result
2056                  * causes incorrect behaviour when there are multiple source
2057                  * addresses on the interface: if the intended recipient is
2058                  * waiting on that interface for the packet, it won't be
2059                  * received, because it will be delivered on the loopback
2060                  * interface and the IP_PKTINFO ipi_ifindex will be set to
2061                  * the loopback interface as well.
2062                  */
2063                 fi = NULL;
2064         }
2065
2066         fnhe = NULL;
2067         do_cache &= fi != NULL;
2068         if (do_cache) {
2069                 struct rtable __rcu **prth;
2070                 struct fib_nh *nh = &FIB_RES_NH(*res);
2071
2072                 fnhe = find_exception(nh, fl4->daddr);
2073                 if (fnhe) {
2074                         prth = &fnhe->fnhe_rth_output;
2075                         rth = rcu_dereference(*prth);
2076                         if (rth && rth->dst.expires &&
2077                             time_after(jiffies, rth->dst.expires)) {
2078                                 ip_del_fnhe(nh, fl4->daddr);
2079                                 fnhe = NULL;
2080                         } else {
2081                                 goto rt_cache;
2082                         }
2083                 }
2084
2085                 if (unlikely(fl4->flowi4_flags &
2086                              FLOWI_FLAG_KNOWN_NH &&
2087                              !(nh->nh_gw &&
2088                                nh->nh_scope == RT_SCOPE_LINK))) {
2089                         do_cache = false;
2090                         goto add;
2091                 }
2092                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2093                 rth = rcu_dereference(*prth);
2094
2095 rt_cache:
2096                 if (rt_cache_valid(rth)) {
2097                         dst_hold(&rth->dst);
2098                         return rth;
2099                 }
2100         }
2101
2102 add:
2103         rth = rt_dst_alloc(dev_out, flags, type,
2104                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2105                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2106                            do_cache);
2107         if (!rth)
2108                 return ERR_PTR(-ENOBUFS);
2109
2110         rth->rt_iif     = orig_oif ? : 0;
2111         if (res->table)
2112                 rth->rt_table_id = res->table->tb_id;
2113
2114         RT_CACHE_STAT_INC(out_slow_tot);
2115
2116         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2117                 if (flags & RTCF_LOCAL &&
2118                     !(dev_out->flags & IFF_LOOPBACK)) {
2119                         rth->dst.output = ip_mc_output;
2120                         RT_CACHE_STAT_INC(out_slow_mc);
2121                 }
2122 #ifdef CONFIG_IP_MROUTE
2123                 if (type == RTN_MULTICAST) {
2124                         if (IN_DEV_MFORWARD(in_dev) &&
2125                             !ipv4_is_local_multicast(fl4->daddr)) {
2126                                 rth->dst.input = ip_mr_input;
2127                                 rth->dst.output = ip_mc_output;
2128                         }
2129                 }
2130 #endif
2131         }
2132
2133         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2134         if (lwtunnel_output_redirect(rth->dst.lwtstate))
2135                 rth->dst.output = lwtunnel_output;
2136
2137         return rth;
2138 }
2139
2140 /*
2141  * Major route resolver routine.
2142  */
2143
2144 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2145                                           int mp_hash)
2146 {
2147         struct net_device *dev_out = NULL;
2148         __u8 tos = RT_FL_TOS(fl4);
2149         unsigned int flags = 0;
2150         struct fib_result res;
2151         struct rtable *rth;
2152         int orig_oif;
2153         int err = -ENETUNREACH;
2154
2155         res.tclassid    = 0;
2156         res.fi          = NULL;
2157         res.table       = NULL;
2158
2159         orig_oif = fl4->flowi4_oif;
2160
2161         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2162         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2163         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2164                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2165
2166         rcu_read_lock();
2167         if (fl4->saddr) {
2168                 rth = ERR_PTR(-EINVAL);
2169                 if (ipv4_is_multicast(fl4->saddr) ||
2170                     ipv4_is_lbcast(fl4->saddr) ||
2171                     ipv4_is_zeronet(fl4->saddr))
2172                         goto out;
2173
2174                 /* I removed the check for oif == dev_out->oif here.
2175                    It was wrong for two reasons:
2176                    1. ip_dev_find(net, saddr) can return the wrong iface if
2177                       saddr is assigned to multiple interfaces.
2178                    2. Moreover, we are allowed to send packets with the saddr
2179                       of another iface. --ANK
2180                  */
2181
2182                 if (fl4->flowi4_oif == 0 &&
2183                     (ipv4_is_multicast(fl4->daddr) ||
2184                      ipv4_is_lbcast(fl4->daddr))) {
2185                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2186                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2187                         if (!dev_out)
2188                                 goto out;
2189
2190                         /* Special hack: the user can direct multicasts
2191                            and limited broadcast via the desired interface
2192                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2193                            This hack is not just for fun, it allows
2194                            vic, vat and friends to work.
2195                            They bind a socket to loopback, set ttl to zero
2196                            and expect that it will work.
2197                            From the viewpoint of the routing cache they are broken,
2198                            because we are not allowed to build a multicast path
2199                            with a loopback source addr (the routing cache
2200                            cannot know that ttl is zero, hence that the packet
2201                            will not leave this host and the route is valid).
2202                            Luckily, this hack is a good workaround.
2203                          */
2204
2205                         fl4->flowi4_oif = dev_out->ifindex;
2206                         goto make_route;
2207                 }
2208
2209                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2210                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2211                         if (!__ip_dev_find(net, fl4->saddr, false))
2212                                 goto out;
2213                 }
2214         }
2215
2216
2217         if (fl4->flowi4_oif) {
2218                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2219                 rth = ERR_PTR(-ENODEV);
2220                 if (!dev_out)
2221                         goto out;
2222
2223                 /* RACE: Check return value of inet_select_addr instead. */
2224                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2225                         rth = ERR_PTR(-ENETUNREACH);
2226                         goto out;
2227                 }
2228                 if (ipv4_is_local_multicast(fl4->daddr) ||
2229                     ipv4_is_lbcast(fl4->daddr) ||
2230                     fl4->flowi4_proto == IPPROTO_IGMP) {
2231                         if (!fl4->saddr)
2232                                 fl4->saddr = inet_select_addr(dev_out, 0,
2233                                                               RT_SCOPE_LINK);
2234                         goto make_route;
2235                 }
2236                 if (!fl4->saddr) {
2237                         if (ipv4_is_multicast(fl4->daddr))
2238                                 fl4->saddr = inet_select_addr(dev_out, 0,
2239                                                               fl4->flowi4_scope);
2240                         else if (!fl4->daddr)
2241                                 fl4->saddr = inet_select_addr(dev_out, 0,
2242                                                               RT_SCOPE_HOST);
2243                 }
2244
2245                 rth = l3mdev_get_rtable(dev_out, fl4);
2246                 if (rth)
2247                         goto out;
2248         }
2249
2250         if (!fl4->daddr) {
2251                 fl4->daddr = fl4->saddr;
2252                 if (!fl4->daddr)
2253                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2254                 dev_out = net->loopback_dev;
2255                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2256                 res.type = RTN_LOCAL;
2257                 flags |= RTCF_LOCAL;
2258                 goto make_route;
2259         }
2260
2261         err = fib_lookup(net, fl4, &res, 0);
2262         if (err) {
2263                 res.fi = NULL;
2264                 res.table = NULL;
2265                 if (fl4->flowi4_oif &&
2266                     !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2267                         /* Apparently, the routing tables are wrong. Assume
2268                            that the destination is on-link.
2269
2270                            WHY? DW.
2271                            Because we are allowed to send to an iface
2272                            even if it has NO routes and NO assigned
2273                            addresses. When oif is specified, routing
2274                            tables are looked up with only one purpose:
2275                            to catch whether the destination is gatewayed
2276                            rather than direct. Moreover, if MSG_DONTROUTE
2277                            is set, we send the packet, ignoring both routing
2278                            tables and ifaddr state. --ANK
2279
2280
2281                            We could do this even if oif is unknown
2282                            (as IPv6 likely does), but we do not.
2283                          */
2284
2285                         if (fl4->saddr == 0)
2286                                 fl4->saddr = inet_select_addr(dev_out, 0,
2287                                                               RT_SCOPE_LINK);
2288                         res.type = RTN_UNICAST;
2289                         goto make_route;
2290                 }
2291                 rth = ERR_PTR(err);
2292                 goto out;
2293         }
2294
2295         if (res.type == RTN_LOCAL) {
2296                 if (!fl4->saddr) {
2297                         if (res.fi->fib_prefsrc)
2298                                 fl4->saddr = res.fi->fib_prefsrc;
2299                         else
2300                                 fl4->saddr = fl4->daddr;
2301                 }
2302                 dev_out = net->loopback_dev;
2303                 fl4->flowi4_oif = dev_out->ifindex;
2304                 flags |= RTCF_LOCAL;
2305                 goto make_route;
2306         }
2307
2308         fib_select_path(net, &res, fl4, mp_hash);
2309
2310         dev_out = FIB_RES_DEV(res);
2311         fl4->flowi4_oif = dev_out->ifindex;
2312
2313
2314 make_route:
2315         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2316
2317 out:
2318         rcu_read_unlock();
2319         return rth;
2320 }
2321 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
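/* Usage sketch (added commentary, not in the original source): kernel code
 * normally reaches this resolver through wrappers such as
 * ip_route_output_key(). A minimal output-route lookup looks roughly like:
 *
 *     struct flowi4 fl4;
 *     struct rtable *rt;
 *
 *     memset(&fl4, 0, sizeof(fl4));
 *     fl4.daddr = daddr;              // hypothetical __be32 destination
 *
 *     rt = ip_route_output_key(net, &fl4);
 *     if (IS_ERR(rt))
 *             return PTR_ERR(rt);
 *     // ... use rt->dst, then drop the reference:
 *     ip_rt_put(rt);
 */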
2322
2323 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2324 {
2325         return NULL;
2326 }
2327
2328 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2329 {
2330         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2331
2332         return mtu ? : dst->dev->mtu;
2333 }
2334
2335 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2336                                           struct sk_buff *skb, u32 mtu)
2337 {
2338 }
2339
2340 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2341                                        struct sk_buff *skb)
2342 {
2343 }
2344
2345 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2346                                           unsigned long old)
2347 {
2348         return NULL;
2349 }
2350
2351 static struct dst_ops ipv4_dst_blackhole_ops = {
2352         .family                 =       AF_INET,
2353         .check                  =       ipv4_blackhole_dst_check,
2354         .mtu                    =       ipv4_blackhole_mtu,
2355         .default_advmss         =       ipv4_default_advmss,
2356         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2357         .redirect               =       ipv4_rt_blackhole_redirect,
2358         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2359         .neigh_lookup           =       ipv4_neigh_lookup,
2360 };
2361
2362 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2363 {
2364         struct rtable *ort = (struct rtable *) dst_orig;
2365         struct rtable *rt;
2366
2367         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2368         if (rt) {
2369                 struct dst_entry *new = &rt->dst;
2370
2371                 new->__use = 1;
2372                 new->input = dst_discard;
2373                 new->output = dst_discard_out;
2374
2375                 new->dev = ort->dst.dev;
2376                 if (new->dev)
2377                         dev_hold(new->dev);
2378
2379                 rt->rt_is_input = ort->rt_is_input;
2380                 rt->rt_iif = ort->rt_iif;
2381                 rt->rt_pmtu = ort->rt_pmtu;
2382
2383                 rt->rt_genid = rt_genid_ipv4(net);
2384                 rt->rt_flags = ort->rt_flags;
2385                 rt->rt_type = ort->rt_type;
2386                 rt->rt_gateway = ort->rt_gateway;
2387                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2388
2389                 INIT_LIST_HEAD(&rt->rt_uncached);
2390                 dst_free(new);
2391         }
2392
2393         dst_release(dst_orig);
2394
2395         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2396 }
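/* Note (added commentary, not in the original source): the blackhole dst
 * built above deliberately goes nowhere: its check() always fails, PMTU
 * and redirect updates are no-ops, metrics are never COWed, and both
 * input and output discard the packet. A caller such as the xfrm code can
 * therefore substitute it for a real route, for instance while an xfrm
 * state is still being resolved.
 */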
2397
2398 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2399                                     const struct sock *sk)
2400 {
2401         struct rtable *rt = __ip_route_output_key(net, flp4);
2402
2403         if (IS_ERR(rt))
2404                 return rt;
2405
2406         if (flp4->flowi4_proto)
2407                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2408                                                         flowi4_to_flowi(flp4),
2409                                                         sk, 0);
2410
2411         return rt;
2412 }
2413 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2414
2415 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2416                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2417                         u32 seq, int event, int nowait, unsigned int flags)
2418 {
2419         struct rtable *rt = skb_rtable(skb);
2420         struct rtmsg *r;
2421         struct nlmsghdr *nlh;
2422         unsigned long expires = 0;
2423         u32 error;
2424         u32 metrics[RTAX_MAX];
2425
2426         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2427         if (!nlh)
2428                 return -EMSGSIZE;
2429
2430         r = nlmsg_data(nlh);
2431         r->rtm_family    = AF_INET;
2432         r->rtm_dst_len  = 32;
2433         r->rtm_src_len  = 0;
2434         r->rtm_tos      = fl4->flowi4_tos;
2435         r->rtm_table    = table_id;
2436         if (nla_put_u32(skb, RTA_TABLE, table_id))
2437                 goto nla_put_failure;
2438         r->rtm_type     = rt->rt_type;
2439         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2440         r->rtm_protocol = RTPROT_UNSPEC;
2441         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2442         if (rt->rt_flags & RTCF_NOTIFY)
2443                 r->rtm_flags |= RTM_F_NOTIFY;
2444         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2445                 r->rtm_flags |= RTCF_DOREDIRECT;
2446
2447         if (nla_put_in_addr(skb, RTA_DST, dst))
2448                 goto nla_put_failure;
2449         if (src) {
2450                 r->rtm_src_len = 32;
2451                 if (nla_put_in_addr(skb, RTA_SRC, src))
2452                         goto nla_put_failure;
2453         }
2454         if (rt->dst.dev &&
2455             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2456                 goto nla_put_failure;
2457 #ifdef CONFIG_IP_ROUTE_CLASSID
2458         if (rt->dst.tclassid &&
2459             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2460                 goto nla_put_failure;
2461 #endif
2462         if (!rt_is_input_route(rt) &&
2463             fl4->saddr != src) {
2464                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2465                         goto nla_put_failure;
2466         }
2467         if (rt->rt_uses_gateway &&
2468             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2469                 goto nla_put_failure;
2470
2471         expires = rt->dst.expires;
2472         if (expires) {
2473                 unsigned long now = jiffies;
2474
2475                 if (time_before(now, expires))
2476                         expires -= now;
2477                 else
2478                         expires = 0;
2479         }
2480
2481         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2482         if (rt->rt_pmtu && expires)
2483                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2484         if (rtnetlink_put_metrics(skb, metrics) < 0)
2485                 goto nla_put_failure;
2486
2487         if (fl4->flowi4_mark &&
2488             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2489                 goto nla_put_failure;
2490
2491         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2492             nla_put_u32(skb, RTA_UID,
2493                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2494                 goto nla_put_failure;
2495
2496         error = rt->dst.error;
2497
2498         if (rt_is_input_route(rt)) {
2499 #ifdef CONFIG_IP_MROUTE
2500                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2501                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2502                         int err = ipmr_get_route(net, skb,
2503                                                  fl4->saddr, fl4->daddr,
2504                                                  r, nowait, portid);
2505
2506                         if (err <= 0) {
2507                                 if (!nowait) {
2508                                         if (err == 0)
2509                                                 return 0;
2510                                         goto nla_put_failure;
2511                                 } else {
2512                                         if (err == -EMSGSIZE)
2513                                                 goto nla_put_failure;
2514                                         error = err;
2515                                 }
2516                         }
2517                 } else
2518 #endif
2519                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2520                                 goto nla_put_failure;
2521         }
2522
2523         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2524                 goto nla_put_failure;
2525
2526         nlmsg_end(skb, nlh);
2527         return 0;
2528
2529 nla_put_failure:
2530         nlmsg_cancel(skb, nlh);
2531         return -EMSGSIZE;
2532 }
2533
2534 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2535 {
2536         struct net *net = sock_net(in_skb->sk);
2537         struct rtmsg *rtm;
2538         struct nlattr *tb[RTA_MAX+1];
2539         struct rtable *rt = NULL;
2540         struct flowi4 fl4;
2541         __be32 dst = 0;
2542         __be32 src = 0;
2543         u32 iif;
2544         int err;
2545         int mark;
2546         struct sk_buff *skb;
2547         u32 table_id = RT_TABLE_MAIN;
2548         kuid_t uid;
2549
2550         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2551         if (err < 0)
2552                 goto errout;
2553
2554         rtm = nlmsg_data(nlh);
2555
2556         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2557         if (!skb) {
2558                 err = -ENOBUFS;
2559                 goto errout;
2560         }
2561
2562         /* Reserve room for dummy headers; this skb can pass
2563            through a good chunk of the routing engine.
2564          */
2565         skb_reset_mac_header(skb);
2566         skb_reset_network_header(skb);
2567
2568         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2569         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2570         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2571
2572         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2573         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2574         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2575         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2576         if (tb[RTA_UID])
2577                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2578         else
2579                 uid = (iif ? INVALID_UID : current_uid());
2580
2581         memset(&fl4, 0, sizeof(fl4));
2582         fl4.daddr = dst;
2583         fl4.saddr = src;
2584         fl4.flowi4_tos = rtm->rtm_tos;
2585         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2586         fl4.flowi4_mark = mark;
2587         fl4.flowi4_uid = uid;
2588
2589         if (netif_index_is_l3_master(net, fl4.flowi4_oif))
2590                 fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
2591
2592         if (iif) {
2593                 struct net_device *dev;
2594
2595                 dev = __dev_get_by_index(net, iif);
2596                 if (!dev) {
2597                         err = -ENODEV;
2598                         goto errout_free;
2599                 }
2600
2601                 skb->protocol   = htons(ETH_P_IP);
2602                 skb->dev        = dev;
2603                 skb->mark       = mark;
2604                 local_bh_disable();
2605                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2606                 local_bh_enable();
2607
2608                 rt = skb_rtable(skb);
2609                 if (err == 0 && rt->dst.error)
2610                         err = -rt->dst.error;
2611         } else {
2612                 rt = ip_route_output_key(net, &fl4);
2613
2614                 err = 0;
2615                 if (IS_ERR(rt))
2616                         err = PTR_ERR(rt);
2617         }
2618
2619         if (err)
2620                 goto errout_free;
2621
2622         skb_dst_set(skb, &rt->dst);
2623         if (rtm->rtm_flags & RTM_F_NOTIFY)
2624                 rt->rt_flags |= RTCF_NOTIFY;
2625
2626         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2627                 table_id = rt->rt_table_id;
2628
2629         err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2630                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2631                            RTM_NEWROUTE, 0, 0);
2632         if (err < 0)
2633                 goto errout_free;
2634
2635         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2636 errout:
2637         return err;
2638
2639 errout_free:
2640         kfree_skb(skb);
2641         goto errout;
2642 }
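/* Usage note (added commentary, not in the original source):
 * inet_rtm_getroute() is the handler behind RTM_GETROUTE requests, which
 * is what `ip route get` sends, e.g.:
 *
 *     $ ip route get 8.8.8.8
 *     $ ip route get 8.8.8.8 from 192.0.2.1 iif eth0
 *
 * A raw request is a netlink message carrying a struct rtmsg plus the
 * RTA_* attributes parsed above (RTA_DST, and optionally RTA_SRC,
 * RTA_IIF, RTA_OIF, RTA_MARK, RTA_UID); the reply is the RTM_NEWROUTE
 * message built by rt_fill_info().
 */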
2643
2644 void ip_rt_multicast_event(struct in_device *in_dev)
2645 {
2646         rt_cache_flush(dev_net(in_dev->dev));
2647 }
2648
2649 #ifdef CONFIG_SYSCTL
2650 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2651 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2652 static int ip_rt_gc_elasticity __read_mostly    = 8;
2653
2654 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2655                                         void __user *buffer,
2656                                         size_t *lenp, loff_t *ppos)
2657 {
2658         struct net *net = (struct net *)__ctl->extra1;
2659
2660         if (write) {
2661                 rt_cache_flush(net);
2662                 fnhe_genid_bump(net);
2663                 return 0;
2664         }
2665
2666         return -EINVAL;
2667 }
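/* Usage note (added commentary, not in the original source): any write to
 * this sysctl flushes the routing cache and bumps the fnhe genid,
 * invalidating cached next-hop exceptions:
 *
 *     echo 1 > /proc/sys/net/ipv4/route/flush
 */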
2668
2669 static struct ctl_table ipv4_route_table[] = {
2670         {
2671                 .procname       = "gc_thresh",
2672                 .data           = &ipv4_dst_ops.gc_thresh,
2673                 .maxlen         = sizeof(int),
2674                 .mode           = 0644,
2675                 .proc_handler   = proc_dointvec,
2676         },
2677         {
2678                 .procname       = "max_size",
2679                 .data           = &ip_rt_max_size,
2680                 .maxlen         = sizeof(int),
2681                 .mode           = 0644,
2682                 .proc_handler   = proc_dointvec,
2683         },
2684         {
2685                 /* Deprecated. Use gc_min_interval_ms */
2686
2687                 .procname       = "gc_min_interval",
2688                 .data           = &ip_rt_gc_min_interval,
2689                 .maxlen         = sizeof(int),
2690                 .mode           = 0644,
2691                 .proc_handler   = proc_dointvec_jiffies,
2692         },
2693         {
2694                 .procname       = "gc_min_interval_ms",
2695                 .data           = &ip_rt_gc_min_interval,
2696                 .maxlen         = sizeof(int),
2697                 .mode           = 0644,
2698                 .proc_handler   = proc_dointvec_ms_jiffies,
2699         },
2700         {
2701                 .procname       = "gc_timeout",
2702                 .data           = &ip_rt_gc_timeout,
2703                 .maxlen         = sizeof(int),
2704                 .mode           = 0644,
2705                 .proc_handler   = proc_dointvec_jiffies,
2706         },
2707         {
2708                 .procname       = "gc_interval",
2709                 .data           = &ip_rt_gc_interval,
2710                 .maxlen         = sizeof(int),
2711                 .mode           = 0644,
2712                 .proc_handler   = proc_dointvec_jiffies,
2713         },
2714         {
2715                 .procname       = "redirect_load",
2716                 .data           = &ip_rt_redirect_load,
2717                 .maxlen         = sizeof(int),
2718                 .mode           = 0644,
2719                 .proc_handler   = proc_dointvec,
2720         },
2721         {
2722                 .procname       = "redirect_number",
2723                 .data           = &ip_rt_redirect_number,
2724                 .maxlen         = sizeof(int),
2725                 .mode           = 0644,
2726                 .proc_handler   = proc_dointvec,
2727         },
2728         {
2729                 .procname       = "redirect_silence",
2730                 .data           = &ip_rt_redirect_silence,
2731                 .maxlen         = sizeof(int),
2732                 .mode           = 0644,
2733                 .proc_handler   = proc_dointvec,
2734         },
2735         {
2736                 .procname       = "error_cost",
2737                 .data           = &ip_rt_error_cost,
2738                 .maxlen         = sizeof(int),
2739                 .mode           = 0644,
2740                 .proc_handler   = proc_dointvec,
2741         },
2742         {
2743                 .procname       = "error_burst",
2744                 .data           = &ip_rt_error_burst,
2745                 .maxlen         = sizeof(int),
2746                 .mode           = 0644,
2747                 .proc_handler   = proc_dointvec,
2748         },
2749         {
2750                 .procname       = "gc_elasticity",
2751                 .data           = &ip_rt_gc_elasticity,
2752                 .maxlen         = sizeof(int),
2753                 .mode           = 0644,
2754                 .proc_handler   = proc_dointvec,
2755         },
2756         {
2757                 .procname       = "mtu_expires",
2758                 .data           = &ip_rt_mtu_expires,
2759                 .maxlen         = sizeof(int),
2760                 .mode           = 0644,
2761                 .proc_handler   = proc_dointvec_jiffies,
2762         },
2763         {
2764                 .procname       = "min_pmtu",
2765                 .data           = &ip_rt_min_pmtu,
2766                 .maxlen         = sizeof(int),
2767                 .mode           = 0644,
2768                 .proc_handler   = proc_dointvec,
2769         },
2770         {
2771                 .procname       = "min_adv_mss",
2772                 .data           = &ip_rt_min_advmss,
2773                 .maxlen         = sizeof(int),
2774                 .mode           = 0644,
2775                 .proc_handler   = proc_dointvec,
2776         },
2777         { }
2778 };
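/* Note (added commentary, not in the original source): the knobs above are
 * exposed under /proc/sys/net/ipv4/route/, e.g.
 * /proc/sys/net/ipv4/route/gc_thresh; for the init namespace the table is
 * registered by ip_static_sysctl_init() below.
 */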
2779
2780 static struct ctl_table ipv4_route_flush_table[] = {
2781         {
2782                 .procname       = "flush",
2783                 .maxlen         = sizeof(int),
2784                 .mode           = 0200,
2785                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2786         },
2787         { },
2788 };
2789
2790 static __net_init int sysctl_route_net_init(struct net *net)
2791 {
2792         struct ctl_table *tbl;
2793
2794         tbl = ipv4_route_flush_table;
2795         if (!net_eq(net, &init_net)) {
2796                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2797                 if (!tbl)
2798                         goto err_dup;
2799
2800                 /* Don't export sysctls to unprivileged users */
2801                 if (net->user_ns != &init_user_ns)
2802                         tbl[0].procname = NULL;
2803         }
2804         tbl[0].extra1 = net;
2805
2806         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2807         if (!net->ipv4.route_hdr)
2808                 goto err_reg;
2809         return 0;
2810
2811 err_reg:
2812         if (tbl != ipv4_route_flush_table)
2813                 kfree(tbl);
2814 err_dup:
2815         return -ENOMEM;
2816 }
2817
2818 static __net_exit void sysctl_route_net_exit(struct net *net)
2819 {
2820         struct ctl_table *tbl;
2821
2822         tbl = net->ipv4.route_hdr->ctl_table_arg;
2823         unregister_net_sysctl_table(net->ipv4.route_hdr);
2824         BUG_ON(tbl == ipv4_route_flush_table);
2825         kfree(tbl);
2826 }
2827
2828 static __net_initdata struct pernet_operations sysctl_route_ops = {
2829         .init = sysctl_route_net_init,
2830         .exit = sysctl_route_net_exit,
2831 };
2832 #endif
2833
2834 static __net_init int rt_genid_init(struct net *net)
2835 {
2836         atomic_set(&net->ipv4.rt_genid, 0);
2837         atomic_set(&net->fnhe_genid, 0);
2838         get_random_bytes(&net->ipv4.dev_addr_genid,
2839                          sizeof(net->ipv4.dev_addr_genid));
2840         return 0;
2841 }
2842
2843 static __net_initdata struct pernet_operations rt_genid_ops = {
2844         .init = rt_genid_init,
2845 };
2846
2847 static int __net_init ipv4_inetpeer_init(struct net *net)
2848 {
2849         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2850
2851         if (!bp)
2852                 return -ENOMEM;
2853         inet_peer_base_init(bp);
2854         net->ipv4.peers = bp;
2855         return 0;
2856 }
2857
2858 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2859 {
2860         struct inet_peer_base *bp = net->ipv4.peers;
2861
2862         net->ipv4.peers = NULL;
2863         inetpeer_invalidate_tree(bp);
2864         kfree(bp);
2865 }
2866
2867 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2868         .init   =       ipv4_inetpeer_init,
2869         .exit   =       ipv4_inetpeer_exit,
2870 };
2871
2872 #ifdef CONFIG_IP_ROUTE_CLASSID
2873 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2874 #endif /* CONFIG_IP_ROUTE_CLASSID */
2875
2876 int __init ip_rt_init(void)
2877 {
2878         int rc = 0;
2879         int cpu;
2880
2881         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2882         if (!ip_idents)
2883                 panic("IP: failed to allocate ip_idents\n");
2884
2885         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2886
2887         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2888         if (!ip_tstamps)
2889                 panic("IP: failed to allocate ip_tstamps\n");
2890
2891         for_each_possible_cpu(cpu) {
2892                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2893
2894                 INIT_LIST_HEAD(&ul->head);
2895                 spin_lock_init(&ul->lock);
2896         }
2897 #ifdef CONFIG_IP_ROUTE_CLASSID
2898         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2899         if (!ip_rt_acct)
2900                 panic("IP: failed to allocate ip_rt_acct\n");
2901 #endif
2902
2903         ipv4_dst_ops.kmem_cachep =
2904                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2905                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2906
2907         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2908
2909         if (dst_entries_init(&ipv4_dst_ops) < 0)
2910                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2911
2912         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2913                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2914
2915         ipv4_dst_ops.gc_thresh = ~0;
2916         ip_rt_max_size = INT_MAX;
2917
2918         devinet_init();
2919         ip_fib_init();
2920
2921         if (ip_rt_proc_init())
2922                 pr_err("Unable to create route proc files\n");
2923 #ifdef CONFIG_XFRM
2924         xfrm_init();
2925         xfrm4_init();
2926 #endif
2927         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2928
2929 #ifdef CONFIG_SYSCTL
2930         register_pernet_subsys(&sysctl_route_ops);
2931 #endif
2932         register_pernet_subsys(&rt_genid_ops);
2933         register_pernet_subsys(&ipv4_inetpeer_ops);
2934         return rc;
2935 }
2936
2937 #ifdef CONFIG_SYSCTL
2938 /*
2939  * We really need to sanitize the damn ipv4 init order, then all
2940  * this nonsense will go away.
2941  */
2942 void __init ip_static_sysctl_init(void)
2943 {
2944         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2945 }
2946 #endif