net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
 111
 112 #define RT_FL_TOS(oldflp4) \
 113         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 114
 115 #define IP_MAX_MTU      0xFFF0
 116
 117 #define RT_GC_TIMEOUT (300*HZ)
 118
 119 static int ip_rt_max_size;
 120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 121 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 123 static int ip_rt_redirect_number __read_mostly  = 9;
 124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 126 static int ip_rt_error_cost __read_mostly       = HZ;
 127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 128 static int ip_rt_gc_elasticity __read_mostly    = 8;
 129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 131 static int ip_rt_min_advmss __read_mostly       = 256;
 132
 133 /*
 134  *      Interface to generic destination cache.
 135  */
 136
 137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 138 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 139 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 141 static void              ipv4_link_failure(struct sk_buff *skb);
 142 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 143                                            struct sk_buff *skb, u32 mtu);
 144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 145                                         struct sk_buff *skb);
 146 static void             ipv4_dst_destroy(struct dst_entry *dst);
 147
 148 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 149                             int how)
 150 {
 151 }
 152
 153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 154 {
 155         WARN_ON(1);
 156         return NULL;
 157 }
 158
 159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 160                                            struct sk_buff *skb,
 161                                            const void *daddr);
 162
 163 static struct dst_ops ipv4_dst_ops = {
 164         .family =               AF_INET,
 165         .protocol =             cpu_to_be16(ETH_P_IP),
 166         .check =                ipv4_dst_check,
 167         .default_advmss =       ipv4_default_advmss,
 168         .mtu =                  ipv4_mtu,
 169         .cow_metrics =          ipv4_cow_metrics,
 170         .destroy =              ipv4_dst_destroy,
 171         .ifdown =               ipv4_dst_ifdown,
 172         .negative_advice =      ipv4_negative_advice,
 173         .link_failure =         ipv4_link_failure,
 174         .update_pmtu =          ip_rt_update_pmtu,
 175         .redirect =             ip_do_redirect,
 176         .local_out =            __ip_local_out,
 177         .neigh_lookup =         ipv4_neigh_lookup,
 178 };
 179
 180 #define ECN_OR_COST(class)      TC_PRIO_##class
 181
 182 const __u8 ip_tos2prio[16] = {
 183         TC_PRIO_BESTEFFORT,
 184         ECN_OR_COST(BESTEFFORT),
 185         TC_PRIO_BESTEFFORT,
 186         ECN_OR_COST(BESTEFFORT),
 187         TC_PRIO_BULK,
 188         ECN_OR_COST(BULK),
 189         TC_PRIO_BULK,
 190         ECN_OR_COST(BULK),
 191         TC_PRIO_INTERACTIVE,
 192         ECN_OR_COST(INTERACTIVE),
 193         TC_PRIO_INTERACTIVE,
 194         ECN_OR_COST(INTERACTIVE),
 195         TC_PRIO_INTERACTIVE_BULK,
 196         ECN_OR_COST(INTERACTIVE_BULK),
 197         TC_PRIO_INTERACTIVE_BULK,
 198         ECN_OR_COST(INTERACTIVE_BULK)
 199 };
 200 EXPORT_SYMBOL(ip_tos2prio);
 201
 202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 203 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 204
 205 static inline int rt_genid(struct net *net)
 206 {
 207         return atomic_read(&net->ipv4.rt_genid);
 208 }
 209
 210 #ifdef CONFIG_PROC_FS
 211 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 212 {
 213         if (*pos)
 214                 return NULL;
 215         return SEQ_START_TOKEN;
 216 }
 217
 218 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 219 {
 220         ++*pos;
 221         return NULL;
 222 }
 223
 224 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 225 {
 226 }
 227
 228 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 229 {
 230         if (v == SEQ_START_TOKEN)
 231                 seq_printf(seq, "%-127s\n",
 232                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 233                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 234                            "HHUptod\tSpecDst");
 235         return 0;
 236 }
 237
 238 static const struct seq_operations rt_cache_seq_ops = {
 239         .start  = rt_cache_seq_start,
 240         .next   = rt_cache_seq_next,
 241         .stop   = rt_cache_seq_stop,
 242         .show   = rt_cache_seq_show,
 243 };
 244
 245 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 246 {
 247         return seq_open(file, &rt_cache_seq_ops);
 248 }
 249
 250 static const struct file_operations rt_cache_seq_fops = {
 251         .owner   = THIS_MODULE,
 252         .open    = rt_cache_seq_open,
 253         .read    = seq_read,
 254         .llseek  = seq_lseek,
 255         .release = seq_release,
 256 };
 257
 258
 259 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 260 {
 261         int cpu;
 262
 263         if (*pos == 0)
 264                 return SEQ_START_TOKEN;
 265
 266         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 267                 if (!cpu_possible(cpu))
 268                         continue;
 269                 *pos = cpu+1;
 270                 return &per_cpu(rt_cache_stat, cpu);
 271         }
 272         return NULL;
 273 }
 274
 275 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 276 {
 277         int cpu;
 278
 279         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 280                 if (!cpu_possible(cpu))
 281                         continue;
 282                 *pos = cpu+1;
 283                 return &per_cpu(rt_cache_stat, cpu);
 284         }
 285         return NULL;
 286
 287 }
 288
 289 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 290 {
 291
 292 }
 293
 294 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 295 {
 296         struct rt_cache_stat *st = v;
 297
 298         if (v == SEQ_START_TOKEN) {
 299                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 300                 return 0;
 301         }
 302
 303         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 304                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 305                    dst_entries_get_slow(&ipv4_dst_ops),
 306                    st->in_hit,
 307                    st->in_slow_tot,
 308                    st->in_slow_mc,
 309                    st->in_no_route,
 310                    st->in_brd,
 311                    st->in_martian_dst,
 312                    st->in_martian_src,
 313
 314                    st->out_hit,
 315                    st->out_slow_tot,
 316                    st->out_slow_mc,
 317
 318                    st->gc_total,
 319                    st->gc_ignored,
 320                    st->gc_goal_miss,
 321                    st->gc_dst_overflow,
 322                    st->in_hlist_search,
 323                    st->out_hlist_search
 324                 );
 325         return 0;
 326 }
 327
 328 static const struct seq_operations rt_cpu_seq_ops = {
 329         .start  = rt_cpu_seq_start,
 330         .next   = rt_cpu_seq_next,
 331         .stop   = rt_cpu_seq_stop,
 332         .show   = rt_cpu_seq_show,
 333 };
 334
 335
 336 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 337 {
 338         return seq_open(file, &rt_cpu_seq_ops);
 339 }
 340
 341 static const struct file_operations rt_cpu_seq_fops = {
 342         .owner   = THIS_MODULE,
 343         .open    = rt_cpu_seq_open,
 344         .read    = seq_read,
 345         .llseek  = seq_lseek,
 346         .release = seq_release,
 347 };
 348
 349 #ifdef CONFIG_IP_ROUTE_CLASSID
 350 static int rt_acct_proc_show(struct seq_file *m, void *v)
 351 {
 352         struct ip_rt_acct *dst, *src;
 353         unsigned int i, j;
 354
 355         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 356         if (!dst)
 357                 return -ENOMEM;
 358
 359         for_each_possible_cpu(i) {
 360                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 361                 for (j = 0; j < 256; j++) {
 362                         dst[j].o_bytes   += src[j].o_bytes;
 363                         dst[j].o_packets += src[j].o_packets;
 364                         dst[j].i_bytes   += src[j].i_bytes;
 365                         dst[j].i_packets += src[j].i_packets;
 366                 }
 367         }
 368
 369         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 370         kfree(dst);
 371         return 0;
 372 }
 373
 374 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 375 {
 376         return single_open(file, rt_acct_proc_show, NULL);
 377 }
 378
 379 static const struct file_operations rt_acct_proc_fops = {
 380         .owner          = THIS_MODULE,
 381         .open           = rt_acct_proc_open,
 382         .read           = seq_read,
 383         .llseek         = seq_lseek,
 384         .release        = single_release,
 385 };
 386 #endif
 387
 388 static int __net_init ip_rt_do_proc_init(struct net *net)
 389 {
 390         struct proc_dir_entry *pde;
 391
 392         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 393                         &rt_cache_seq_fops);
 394         if (!pde)
 395                 goto err1;
 396
 397         pde = proc_create("rt_cache", S_IRUGO,
 398                           net->proc_net_stat, &rt_cpu_seq_fops);
 399         if (!pde)
 400                 goto err2;
 401
 402 #ifdef CONFIG_IP_ROUTE_CLASSID
 403         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 404         if (!pde)
 405                 goto err3;
 406 #endif
 407         return 0;
 408
 409 #ifdef CONFIG_IP_ROUTE_CLASSID
 410 err3:
 411         remove_proc_entry("rt_cache", net->proc_net_stat);
 412 #endif
 413 err2:
 414         remove_proc_entry("rt_cache", net->proc_net);
 415 err1:
 416         return -ENOMEM;
 417 }
 418
 419 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 420 {
 421         remove_proc_entry("rt_cache", net->proc_net_stat);
 422         remove_proc_entry("rt_cache", net->proc_net);
 423 #ifdef CONFIG_IP_ROUTE_CLASSID
 424         remove_proc_entry("rt_acct", net->proc_net);
 425 #endif
 426 }
 427
 428 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 429         .init = ip_rt_do_proc_init,
 430         .exit = ip_rt_do_proc_exit,
 431 };
 432
 433 static int __init ip_rt_proc_init(void)
 434 {
 435         return register_pernet_subsys(&ip_rt_proc_ops);
 436 }
 437
 438 #else
 439 static inline int ip_rt_proc_init(void)
 440 {
 441         return 0;
 442 }
 443 #endif /* CONFIG_PROC_FS */
 444
 445 static inline bool rt_is_expired(const struct rtable *rth)
 446 {
 447         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 448 }
 449
 450 /*
 451  * Perturbation of rt_genid by a small quantity [1..256]
 452  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 453  * many times (2^24) without giving recent rt_genid.
 454  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 455  */
 456 static void rt_cache_invalidate(struct net *net)
 457 {
 458         unsigned char shuffle;
 459
 460         get_random_bytes(&shuffle, sizeof(shuffle));
 461         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 462 }
 463
 464 /*
 465  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 466  * delay >= 0 : invalidate & flush cache (can be long)
 467  */
 468 void rt_cache_flush(struct net *net, int delay)
 469 {
 470         rt_cache_invalidate(net);
 471 }
 472
 473 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 474                                            struct sk_buff *skb,
 475                                            const void *daddr)
 476 {
 477         struct net_device *dev = dst->dev;
 478         const __be32 *pkey = daddr;
 479         const struct rtable *rt;
 480         struct neighbour *n;
 481
 482         rt = (const struct rtable *) dst;
 483         if (rt->rt_gateway)
 484                 pkey = (const __be32 *) &rt->rt_gateway;
 485         else if (skb)
 486                 pkey = &ip_hdr(skb)->daddr;
 487
 488         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 489         if (n)
 490                 return n;
 491         return neigh_create(&arp_tbl, pkey, dev);
 492 }
 493
 494 /*
 495  * Peer allocation may fail only in serious out-of-memory conditions.  However
 496  * we still can generate some output.
 497  * Random ID selection looks a bit dangerous because we have no chances to
 498  * select ID being unique in a reasonable period of time.
 499  * But broken packet identifier may be better than no packet at all.
 500  */
 501 static void ip_select_fb_ident(struct iphdr *iph)
 502 {
 503         static DEFINE_SPINLOCK(ip_fb_id_lock);
 504         static u32 ip_fallback_id;
 505         u32 salt;
 506
 507         spin_lock_bh(&ip_fb_id_lock);
 508         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
 509         iph->id = htons(salt & 0xFFFF);
 510         ip_fallback_id = salt;
 511         spin_unlock_bh(&ip_fb_id_lock);
 512 }
 513
 514 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 515 {
 516         struct net *net = dev_net(dst->dev);
 517         struct inet_peer *peer;
 518
 519         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
 520         if (peer) {
 521                 iph->id = htons(inet_getid(peer, more));
 522                 inet_putpeer(peer);
 523                 return;
 524         }
 525
 526         ip_select_fb_ident(iph);
 527 }
 528 EXPORT_SYMBOL(__ip_select_ident);
 529
 530 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 531                              const struct iphdr *iph,
 532                              int oif, u8 tos,
 533                              u8 prot, u32 mark, int flow_flags)
 534 {
 535         if (sk) {
 536                 const struct inet_sock *inet = inet_sk(sk);
 537
 538                 oif = sk->sk_bound_dev_if;
 539                 mark = sk->sk_mark;
 540                 tos = RT_CONN_FLAGS(sk);
 541                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 542         }
 543         flowi4_init_output(fl4, oif, mark, tos,
 544                            RT_SCOPE_UNIVERSE, prot,
 545                            flow_flags,
 546                            iph->daddr, iph->saddr, 0, 0);
 547 }
 548
 549 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 550                                const struct sock *sk)
 551 {
 552         const struct iphdr *iph = ip_hdr(skb);
 553         int oif = skb->dev->ifindex;
 554         u8 tos = RT_TOS(iph->tos);
 555         u8 prot = iph->protocol;
 556         u32 mark = skb->mark;
 557
 558         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 559 }
 560
 561 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 562 {
 563         const struct inet_sock *inet = inet_sk(sk);
 564         const struct ip_options_rcu *inet_opt;
 565         __be32 daddr = inet->inet_daddr;
 566
 567         rcu_read_lock();
 568         inet_opt = rcu_dereference(inet->inet_opt);
 569         if (inet_opt && inet_opt->opt.srr)
 570                 daddr = inet_opt->opt.faddr;
 571         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 572                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 573                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 574                            inet_sk_flowi_flags(sk),
 575                            daddr, inet->inet_saddr, 0, 0);
 576         rcu_read_unlock();
 577 }
 578
 579 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 580                                  const struct sk_buff *skb)
 581 {
 582         if (skb)
 583                 build_skb_flow_key(fl4, skb, sk);
 584         else
 585                 build_sk_flow_key(fl4, sk);
 586 }
 587
 588 static inline void rt_free(struct rtable *rt)
 589 {
 590         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 591 }
 592
 593 static DEFINE_SPINLOCK(fnhe_lock);
 594
 595 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 596 {
 597         struct fib_nh_exception *fnhe, *oldest;
 598         struct rtable *orig;
 599
 600         oldest = rcu_dereference(hash->chain);
 601         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 602              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 603                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 604                         oldest = fnhe;
 605         }
 606         orig = rcu_dereference(oldest->fnhe_rth);
 607         if (orig) {
 608                 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
 609                 rt_free(orig);
 610         }
 611         return oldest;
 612 }
 613
 614 static inline u32 fnhe_hashfun(__be32 daddr)
 615 {
 616         u32 hval;
 617
 618         hval = (__force u32) daddr;
 619         hval ^= (hval >> 11) ^ (hval >> 22);
 620
 621         return hval & (FNHE_HASH_SIZE - 1);
 622 }
 623
 624 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 625                                   u32 pmtu, unsigned long expires)
 626 {
 627         struct fnhe_hash_bucket *hash;
 628         struct fib_nh_exception *fnhe;
 629         int depth;
 630         u32 hval = fnhe_hashfun(daddr);
 631
 632         spin_lock_bh(&fnhe_lock);
 633
 634         hash = nh->nh_exceptions;
 635         if (!hash) {
 636                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 637                 if (!hash)
 638                         goto out_unlock;
 639                 nh->nh_exceptions = hash;
 640         }
 641
 642         hash += hval;
 643
 644         depth = 0;
 645         for (fnhe = rcu_dereference(hash->chain); fnhe;
 646              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 647                 if (fnhe->fnhe_daddr == daddr)
 648                         break;
 649                 depth++;
 650         }
 651
 652         if (fnhe) {
 653                 if (gw)
 654                         fnhe->fnhe_gw = gw;
 655                 if (pmtu) {
 656                         fnhe->fnhe_pmtu = pmtu;
 657                         fnhe->fnhe_expires = expires;
 658                 }
 659         } else {
 660                 if (depth > FNHE_RECLAIM_DEPTH)
 661                         fnhe = fnhe_oldest(hash);
 662                 else {
 663                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 664                         if (!fnhe)
 665                                 goto out_unlock;
 666
 667                         fnhe->fnhe_next = hash->chain;
 668                         rcu_assign_pointer(hash->chain, fnhe);
 669                 }
 670                 fnhe->fnhe_daddr = daddr;
 671                 fnhe->fnhe_gw = gw;
 672                 fnhe->fnhe_pmtu = pmtu;
 673                 fnhe->fnhe_expires = expires;
 674         }
 675
 676         fnhe->fnhe_stamp = jiffies;
 677
 678 out_unlock:
 679         spin_unlock_bh(&fnhe_lock);
 680         return;
 681 }
 682
 683 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 684                              bool kill_route)
 685 {
 686         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 687         __be32 old_gw = ip_hdr(skb)->saddr;
 688         struct net_device *dev = skb->dev;
 689         struct in_device *in_dev;
 690         struct fib_result res;
 691         struct neighbour *n;
 692         struct net *net;
 693
 694         switch (icmp_hdr(skb)->code & 7) {
 695         case ICMP_REDIR_NET:
 696         case ICMP_REDIR_NETTOS:
 697         case ICMP_REDIR_HOST:
 698         case ICMP_REDIR_HOSTTOS:
 699                 break;
 700
 701         default:
 702                 return;
 703         }
 704
 705         if (rt->rt_gateway != old_gw)
 706                 return;
 707
 708         in_dev = __in_dev_get_rcu(dev);
 709         if (!in_dev)
 710                 return;
 711
 712         net = dev_net(dev);
 713         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 714             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 715             ipv4_is_zeronet(new_gw))
 716                 goto reject_redirect;
 717
 718         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 719                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 720                         goto reject_redirect;
 721                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 722                         goto reject_redirect;
 723         } else {
 724                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 725                         goto reject_redirect;
 726         }
 727
 728         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
 729         if (n) {
 730                 if (!(n->nud_state & NUD_VALID)) {
 731                         neigh_event_send(n, NULL);
 732                 } else {
 733                         if (fib_lookup(net, fl4, &res) == 0) {
 734                                 struct fib_nh *nh = &FIB_RES_NH(res);
 735
 736                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 737                                                       0, 0);
 738                         }
 739                         if (kill_route)
 740                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 741                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 742                 }
 743                 neigh_release(n);
 744         }
 745         return;
 746
 747 reject_redirect:
 748 #ifdef CONFIG_IP_ROUTE_VERBOSE
 749         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 750                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 751                 __be32 daddr = iph->daddr;
 752                 __be32 saddr = iph->saddr;
 753
 754                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 755                                      "  Advised path = %pI4 -> %pI4\n",
 756                                      &old_gw, dev->name, &new_gw,
 757                                      &saddr, &daddr);
 758         }
 759 #endif
 760         ;
 761 }
 762
 763 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 764 {
 765         struct rtable *rt;
 766         struct flowi4 fl4;
 767
 768         rt = (struct rtable *) dst;
 769
 770         ip_rt_build_flow_key(&fl4, sk, skb);
 771         __ip_do_redirect(rt, skb, &fl4, true);
 772 }
 773
 774 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 775 {
 776         struct rtable *rt = (struct rtable *)dst;
 777         struct dst_entry *ret = dst;
 778
 779         if (rt) {
 780                 if (dst->obsolete > 0) {
 781                         ip_rt_put(rt);
 782                         ret = NULL;
 783                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 784                            rt->dst.expires) {
 785                         ip_rt_put(rt);
 786                         ret = NULL;
 787                 }
 788         }
 789         return ret;
 790 }
 791
 792 /*
 793  * Algorithm:
 794  *      1. The first ip_rt_redirect_number redirects are sent
 795  *         with exponential backoff, then we stop sending them at all,
 796  *         assuming that the host ignores our redirects.
 797  *      2. If we did not see packets requiring redirects
 798  *         during ip_rt_redirect_silence, we assume that the host
 799  *         forgot redirected route and start to send redirects again.
 800  *
 801  * This algorithm is much cheaper and more intelligent than dumb load limiting
 802  * in icmp.c.
 803  *
 804  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 805  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 806  */
 807
 808 void ip_rt_send_redirect(struct sk_buff *skb)
 809 {
 810         struct rtable *rt = skb_rtable(skb);
 811         struct in_device *in_dev;
 812         struct inet_peer *peer;
 813         struct net *net;
 814         int log_martians;
 815
 816         rcu_read_lock();
 817         in_dev = __in_dev_get_rcu(rt->dst.dev);
 818         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 819                 rcu_read_unlock();
 820                 return;
 821         }
 822         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 823         rcu_read_unlock();
 824
 825         net = dev_net(rt->dst.dev);
 826         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 827         if (!peer) {
 828                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
 829                 return;
 830         }
 831
 832         /* No redirected packets during ip_rt_redirect_silence;
 833          * reset the algorithm.
 834          */
 835         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 836                 peer->rate_tokens = 0;
 837
 838         /* Too many ignored redirects; do not send anything
 839          * set dst.rate_last to the last seen redirected packet.
 840          */
 841         if (peer->rate_tokens >= ip_rt_redirect_number) {
 842                 peer->rate_last = jiffies;
 843                 goto out_put_peer;
 844         }
 845
 846         /* Check for load limit; set rate_last to the latest sent
 847          * redirect.
 848          */
 849         if (peer->rate_tokens == 0 ||
 850             time_after(jiffies,
 851                        (peer->rate_last +
 852                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 853                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
 854                 peer->rate_last = jiffies;
 855                 ++peer->rate_tokens;
 856 #ifdef CONFIG_IP_ROUTE_VERBOSE
 857                 if (log_martians &&
 858                     peer->rate_tokens == ip_rt_redirect_number)
 859                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 860                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 861                                              &ip_hdr(skb)->daddr, &rt->rt_gateway);
 862 #endif
 863         }
 864 out_put_peer:
 865         inet_putpeer(peer);
 866 }
 867
 868 static int ip_error(struct sk_buff *skb)
 869 {
 870         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 871         struct rtable *rt = skb_rtable(skb);
 872         struct inet_peer *peer;
 873         unsigned long now;
 874         struct net *net;
 875         bool send;
 876         int code;
 877
 878         net = dev_net(rt->dst.dev);
 879         if (!IN_DEV_FORWARD(in_dev)) {
 880                 switch (rt->dst.error) {
 881                 case EHOSTUNREACH:
 882                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
 883                         break;
 884
 885                 case ENETUNREACH:
 886                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 887                         break;
 888                 }
 889                 goto out;
 890         }
 891
 892         switch (rt->dst.error) {
 893         case EINVAL:
 894         default:
 895                 goto out;
 896         case EHOSTUNREACH:
 897                 code = ICMP_HOST_UNREACH;
 898                 break;
 899         case ENETUNREACH:
 900                 code = ICMP_NET_UNREACH;
 901                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 902                 break;
 903         case EACCES:
 904                 code = ICMP_PKT_FILTERED;
 905                 break;
 906         }
 907
 908         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 909
 910         send = true;
 911         if (peer) {
 912                 now = jiffies;
 913                 peer->rate_tokens += now - peer->rate_last;
 914                 if (peer->rate_tokens > ip_rt_error_burst)
 915                         peer->rate_tokens = ip_rt_error_burst;
 916                 peer->rate_last = now;
 917                 if (peer->rate_tokens >= ip_rt_error_cost)
 918                         peer->rate_tokens -= ip_rt_error_cost;
 919                 else
 920                         send = false;
 921                 inet_putpeer(peer);
 922         }
 923         if (send)
 924                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 925
 926 out:    kfree_skb(skb);
 927         return 0;
 928 }
 929
 930 static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 931 {
 932         struct fib_result res;
 933
 934         if (mtu < ip_rt_min_pmtu)
 935                 mtu = ip_rt_min_pmtu;
 936
 937         rcu_read_lock();
 938         if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
 939                 struct fib_nh *nh = &FIB_RES_NH(res);
 940
 941                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 942                                       jiffies + ip_rt_mtu_expires);
 943         }
 944         rcu_read_unlock();
 945         return mtu;
 946 }
 947
 948 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 949                               struct sk_buff *skb, u32 mtu)
 950 {
 951         struct rtable *rt = (struct rtable *) dst;
 952         struct flowi4 fl4;
 953
 954         ip_rt_build_flow_key(&fl4, sk, skb);
 955         mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
 956
 957         if (!rt->rt_pmtu) {
 958                 dst->obsolete = DST_OBSOLETE_KILL;
 959         } else {
 960                 rt->rt_pmtu = mtu;
 961                 rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires);
 962         }
 963 }
 964
 965 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 966                       int oif, u32 mark, u8 protocol, int flow_flags)
 967 {
 968         const struct iphdr *iph = (const struct iphdr *) skb->data;
 969         struct flowi4 fl4;
 970         struct rtable *rt;
 971
 972         __build_flow_key(&fl4, NULL, iph, oif,
 973                          RT_TOS(iph->tos), protocol, mark, flow_flags);
 974         rt = __ip_route_output_key(net, &fl4);
 975         if (!IS_ERR(rt)) {
 976                 __ip_rt_update_pmtu(rt, &fl4, mtu);
 977                 ip_rt_put(rt);
 978         }
 979 }
 980 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
 981
 982 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 983 {
 984         const struct iphdr *iph = (const struct iphdr *) skb->data;
 985         struct flowi4 fl4;
 986         struct rtable *rt;
 987
 988         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
 989         rt = __ip_route_output_key(sock_net(sk), &fl4);
 990         if (!IS_ERR(rt)) {
 991                 __ip_rt_update_pmtu(rt, &fl4, mtu);
 992                 ip_rt_put(rt);
 993         }
 994 }
 995 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
 996
 997 void ipv4_redirect(struct sk_buff *skb, struct net *net,
 998                    int oif, u32 mark, u8 protocol, int flow_flags)
 999 {
1000         const struct iphdr *iph = (const struct iphdr *) skb->data;
1001         struct flowi4 fl4;
1002         struct rtable *rt;
1003
1004         __build_flow_key(&fl4, NULL, iph, oif,
1005                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1006         rt = __ip_route_output_key(net, &fl4);
1007         if (!IS_ERR(rt)) {
1008                 __ip_do_redirect(rt, skb, &fl4, false);
1009                 ip_rt_put(rt);
1010         }
1011 }
1012 EXPORT_SYMBOL_GPL(ipv4_redirect);
1013
1014 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1015 {
1016         const struct iphdr *iph = (const struct iphdr *) skb->data;
1017         struct flowi4 fl4;
1018         struct rtable *rt;
1019
1020         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1021         rt = __ip_route_output_key(sock_net(sk), &fl4);
1022         if (!IS_ERR(rt)) {
1023                 __ip_do_redirect(rt, skb, &fl4, false);
1024                 ip_rt_put(rt);
1025         }
1026 }
1027 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1028
1029 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1030 {
1031         struct rtable *rt = (struct rtable *) dst;
1032
1033         /* All IPV4 dsts are created with ->obsolete set to the value
1034          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1035          * into this function always.
1036          *
1037          * When a PMTU/redirect information update invalidates a
1038          * route, this is indicated by setting obsolete to
1039          * DST_OBSOLETE_KILL.
1040          */
1041         if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1042                 return NULL;
1043         return dst;
1044 }
1045
1046 static void ipv4_link_failure(struct sk_buff *skb)
1047 {
1048         struct rtable *rt;
1049
1050         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1051
1052         rt = skb_rtable(skb);
1053         if (rt)
1054                 dst_set_expires(&rt->dst, 0);
1055 }
1056
1057 static int ip_rt_bug(struct sk_buff *skb)
1058 {
1059         pr_debug("%s: %pI4 -> %pI4, %s\n",
1060                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1061                  skb->dev ? skb->dev->name : "?");
1062         kfree_skb(skb);
1063         WARN_ON(1);
1064         return 0;
1065 }
1066
1067 /*
1068    We do not cache source address of outgoing interface,
1069    because it is used only by IP RR, TS and SRR options,
1070    so that it out of fast path.
1071
1072    BTW remember: "addr" is allowed to be not aligned
1073    in IP options!
1074  */
1075
1076 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1077 {
1078         __be32 src;
1079
1080         if (rt_is_output_route(rt))
1081                 src = ip_hdr(skb)->saddr;
1082         else {
1083                 struct fib_result res;
1084                 struct flowi4 fl4;
1085                 struct iphdr *iph;
1086
1087                 iph = ip_hdr(skb);
1088
1089                 memset(&fl4, 0, sizeof(fl4));
1090                 fl4.daddr = iph->daddr;
1091                 fl4.saddr = iph->saddr;
1092                 fl4.flowi4_tos = RT_TOS(iph->tos);
1093                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1094                 fl4.flowi4_iif = skb->dev->ifindex;
1095                 fl4.flowi4_mark = skb->mark;
1096
1097                 rcu_read_lock();
1098                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1099                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1100                 else
1101                         src = inet_select_addr(rt->dst.dev,
1102                                                rt_nexthop(rt, iph->daddr),
1103                                                RT_SCOPE_UNIVERSE);
1104                 rcu_read_unlock();
1105         }
1106         memcpy(addr, &src, 4);
1107 }
1108
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and high 16-bit halves
 * are handled independently: a half is taken from @tag only when that
 * half of the existing tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;

	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1118
1119 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1120 {
1121         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1122
1123         if (advmss == 0) {
1124                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1125                                ip_rt_min_advmss);
1126                 if (advmss > 65535 - 40)
1127                         advmss = 65535 - 40;
1128         }
1129         return advmss;
1130 }
1131
1132 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1133 {
1134         const struct rtable *rt = (const struct rtable *) dst;
1135         unsigned int mtu = rt->rt_pmtu;
1136
1137         if (mtu && time_after_eq(jiffies, rt->dst.expires))
1138                 mtu = 0;
1139
1140         if (!mtu)
1141                 mtu = dst_metric_raw(dst, RTAX_MTU);
1142
1143         if (mtu && rt_is_output_route(rt))
1144                 return mtu;
1145
1146         mtu = dst->dev->mtu;
1147
1148         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1149                 if (rt->rt_gateway && mtu > 576)
1150                         mtu = 576;
1151         }
1152
1153         if (mtu > IP_MAX_MTU)
1154                 mtu = IP_MAX_MTU;
1155
1156         return mtu;
1157 }
1158
1159 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1160 {
1161         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1162         struct fib_nh_exception *fnhe;
1163         u32 hval;
1164
1165         if (!hash)
1166                 return NULL;
1167
1168         hval = fnhe_hashfun(daddr);
1169
1170         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1171              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1172                 if (fnhe->fnhe_daddr == daddr)
1173                         return fnhe;
1174         }
1175         return NULL;
1176 }
1177
1178 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1179                               __be32 daddr)
1180 {
1181         bool ret = false;
1182
1183         spin_lock_bh(&fnhe_lock);
1184
1185         if (daddr == fnhe->fnhe_daddr) {
1186                 struct rtable *orig;
1187
1188                 if (fnhe->fnhe_pmtu) {
1189                         unsigned long expires = fnhe->fnhe_expires;
1190                         unsigned long diff = expires - jiffies;
1191
1192                         if (time_before(jiffies, expires)) {
1193                                 rt->rt_pmtu = fnhe->fnhe_pmtu;
1194                                 dst_set_expires(&rt->dst, diff);
1195                         }
1196                 }
1197                 if (fnhe->fnhe_gw) {
1198                         rt->rt_flags |= RTCF_REDIRECTED;
1199                         rt->rt_gateway = fnhe->fnhe_gw;
1200                 }
1201
1202                 orig = rcu_dereference(fnhe->fnhe_rth);
1203                 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1204                 if (orig)
1205                         rt_free(orig);
1206
1207                 fnhe->fnhe_stamp = jiffies;
1208                 ret = true;
1209         } else {
1210                 /* Routes we intend to cache in nexthop exception have
1211                  * the DST_NOCACHE bit clear.  However, if we are
1212                  * unsuccessful at storing this route into the cache
1213                  * we really need to set it.
1214                  */
1215                 rt->dst.flags |= DST_NOCACHE;
1216         }
1217         spin_unlock_bh(&fnhe_lock);
1218
1219         return ret;
1220 }
1221
1222 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1223 {
1224         struct rtable *orig, *prev, **p;
1225         bool ret = true;
1226
1227         if (rt_is_input_route(rt)) {
1228                 p = (struct rtable **)&nh->nh_rth_input;
1229         } else {
1230                 if (!nh->nh_pcpu_rth_output)
1231                         goto nocache;
1232                 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1233         }
1234         orig = *p;
1235
1236         prev = cmpxchg(p, orig, rt);
1237         if (prev == orig) {
1238                 if (orig)
1239                         rt_free(orig);
1240         } else {
1241                 /* Routes we intend to cache in the FIB nexthop have
1242                  * the DST_NOCACHE bit clear.  However, if we are
1243                  * unsuccessful at storing this route into the cache
1244                  * we really need to set it.
1245                  */
1246 nocache:
1247                 rt->dst.flags |= DST_NOCACHE;
1248                 ret = false;
1249         }
1250
1251         return ret;
1252 }
1253
1254 static DEFINE_SPINLOCK(rt_uncached_lock);
1255 static LIST_HEAD(rt_uncached_list);
1256
1257 static void rt_add_uncached_list(struct rtable *rt)
1258 {
1259         spin_lock_bh(&rt_uncached_lock);
1260         list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1261         spin_unlock_bh(&rt_uncached_lock);
1262 }
1263
1264 static void ipv4_dst_destroy(struct dst_entry *dst)
1265 {
1266         struct rtable *rt = (struct rtable *) dst;
1267
1268         if (!list_empty(&rt->rt_uncached)) {
1269                 spin_lock_bh(&rt_uncached_lock);
1270                 list_del(&rt->rt_uncached);
1271                 spin_unlock_bh(&rt_uncached_lock);
1272         }
1273 }
1274
1275 void rt_flush_dev(struct net_device *dev)
1276 {
1277         if (!list_empty(&rt_uncached_list)) {
1278                 struct net *net = dev_net(dev);
1279                 struct rtable *rt;
1280
1281                 spin_lock_bh(&rt_uncached_lock);
1282                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1283                         if (rt->dst.dev != dev)
1284                                 continue;
1285                         rt->dst.dev = net->loopback_dev;
1286                         dev_hold(rt->dst.dev);
1287                         dev_put(dev);
1288                 }
1289                 spin_unlock_bh(&rt_uncached_lock);
1290         }
1291 }
1292
1293 static bool rt_cache_valid(const struct rtable *rt)
1294 {
1295         return  rt &&
1296                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1297                 !rt_is_expired(rt);
1298 }
1299
1300 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1301                            const struct fib_result *res,
1302                            struct fib_nh_exception *fnhe,
1303                            struct fib_info *fi, u16 type, u32 itag)
1304 {
1305         bool cached = false;
1306
1307         if (fi) {
1308                 struct fib_nh *nh = &FIB_RES_NH(*res);
1309
1310                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1311                         rt->rt_gateway = nh->nh_gw;
1312                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1313 #ifdef CONFIG_IP_ROUTE_CLASSID
1314                 rt->dst.tclassid = nh->nh_tclassid;
1315 #endif
1316                 if (unlikely(fnhe))
1317                         cached = rt_bind_exception(rt, fnhe, daddr);
1318                 else if (!(rt->dst.flags & DST_NOCACHE))
1319                         cached = rt_cache_route(nh, rt);
1320         }
1321         if (unlikely(!cached))
1322                 rt_add_uncached_list(rt);
1323
1324 #ifdef CONFIG_IP_ROUTE_CLASSID
1325 #ifdef CONFIG_IP_MULTIPLE_TABLES
1326         set_class_tag(rt, res->tclassid);
1327 #endif
1328         set_class_tag(rt, itag);
1329 #endif
1330 }
1331
1332 static struct rtable *rt_dst_alloc(struct net_device *dev,
1333                                    bool nopolicy, bool noxfrm, bool will_cache)
1334 {
1335         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1336                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1337                          (nopolicy ? DST_NOPOLICY : 0) |
1338                          (noxfrm ? DST_NOXFRM : 0));
1339 }
1340
1341 /* called in rcu_read_lock() section */
1342 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1343                                 u8 tos, struct net_device *dev, int our)
1344 {
1345         struct rtable *rth;
1346         struct in_device *in_dev = __in_dev_get_rcu(dev);
1347         u32 itag = 0;
1348         int err;
1349
1350         /* Primary sanity checks. */
1351
1352         if (in_dev == NULL)
1353                 return -EINVAL;
1354
1355         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1356             skb->protocol != htons(ETH_P_IP))
1357                 goto e_inval;
1358
1359         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1360                 if (ipv4_is_loopback(saddr))
1361                         goto e_inval;
1362
1363         if (ipv4_is_zeronet(saddr)) {
1364                 if (!ipv4_is_local_multicast(daddr))
1365                         goto e_inval;
1366         } else {
1367                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1368                                           in_dev, &itag);
1369                 if (err < 0)
1370                         goto e_err;
1371         }
1372         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1373                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1374         if (!rth)
1375                 goto e_nobufs;
1376
1377 #ifdef CONFIG_IP_ROUTE_CLASSID
1378         rth->dst.tclassid = itag;
1379 #endif
1380         rth->dst.output = ip_rt_bug;
1381
1382         rth->rt_genid   = rt_genid(dev_net(dev));
1383         rth->rt_flags   = RTCF_MULTICAST;
1384         rth->rt_type    = RTN_MULTICAST;
1385         rth->rt_is_input= 1;
1386         rth->rt_iif     = 0;
1387         rth->rt_pmtu    = 0;
1388         rth->rt_gateway = 0;
1389         INIT_LIST_HEAD(&rth->rt_uncached);
1390         if (our) {
1391                 rth->dst.input= ip_local_deliver;
1392                 rth->rt_flags |= RTCF_LOCAL;
1393         }
1394
1395 #ifdef CONFIG_IP_MROUTE
1396         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1397                 rth->dst.input = ip_mr_input;
1398 #endif
1399         RT_CACHE_STAT_INC(in_slow_mc);
1400
1401         skb_dst_set(skb, &rth->dst);
1402         return 0;
1403
1404 e_nobufs:
1405         return -ENOBUFS;
1406 e_inval:
1407         return -EINVAL;
1408 e_err:
1409         return err;
1410 }
1411
1412
1413 static void ip_handle_martian_source(struct net_device *dev,
1414                                      struct in_device *in_dev,
1415                                      struct sk_buff *skb,
1416                                      __be32 daddr,
1417                                      __be32 saddr)
1418 {
1419         RT_CACHE_STAT_INC(in_martian_src);
1420 #ifdef CONFIG_IP_ROUTE_VERBOSE
1421         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1422                 /*
1423                  *      RFC1812 recommendation, if source is martian,
1424                  *      the only hint is MAC header.
1425                  */
1426                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1427                         &daddr, &saddr, dev->name);
1428                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1429                         print_hex_dump(KERN_WARNING, "ll header: ",
1430                                        DUMP_PREFIX_OFFSET, 16, 1,
1431                                        skb_mac_header(skb),
1432                                        dev->hard_header_len, true);
1433                 }
1434         }
1435 #endif
1436 }
1437
1438 /* called in rcu_read_lock() section */
1439 static int __mkroute_input(struct sk_buff *skb,
1440                            const struct fib_result *res,
1441                            struct in_device *in_dev,
1442                            __be32 daddr, __be32 saddr, u32 tos)
1443 {
1444         struct rtable *rth;
1445         int err;
1446         struct in_device *out_dev;
1447         unsigned int flags = 0;
1448         bool do_cache;
1449         u32 itag;
1450
1451         /* get a working reference to the output device */
1452         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1453         if (out_dev == NULL) {
1454                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1455                 return -EINVAL;
1456         }
1457
1458
1459         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1460                                   in_dev->dev, in_dev, &itag);
1461         if (err < 0) {
1462                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1463                                          saddr);
1464
1465                 goto cleanup;
1466         }
1467
1468         if (out_dev == in_dev && err &&
1469             (IN_DEV_SHARED_MEDIA(out_dev) ||
1470              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1471                 flags |= RTCF_DOREDIRECT;
1472
1473         if (skb->protocol != htons(ETH_P_IP)) {
1474                 /* Not IP (i.e. ARP). Do not create route, if it is
1475                  * invalid for proxy arp. DNAT routes are always valid.
1476                  *
1477                  * Proxy arp feature have been extended to allow, ARP
1478                  * replies back to the same interface, to support
1479                  * Private VLAN switch technologies. See arp.c.
1480                  */
1481                 if (out_dev == in_dev &&
1482                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1483                         err = -EINVAL;
1484                         goto cleanup;
1485                 }
1486         }
1487
1488         do_cache = false;
1489         if (res->fi) {
1490                 if (!itag) {
1491                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1492                         if (rt_cache_valid(rth)) {
1493                                 skb_dst_set_noref(skb, &rth->dst);
1494                                 goto out;
1495                         }
1496                         do_cache = true;
1497                 }
1498         }
1499
1500         rth = rt_dst_alloc(out_dev->dev,
1501                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1502                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1503         if (!rth) {
1504                 err = -ENOBUFS;
1505                 goto cleanup;
1506         }
1507
1508         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1509         rth->rt_flags = flags;
1510         rth->rt_type = res->type;
1511         rth->rt_is_input = 1;
1512         rth->rt_iif     = 0;
1513         rth->rt_pmtu    = 0;
1514         rth->rt_gateway = 0;
1515         INIT_LIST_HEAD(&rth->rt_uncached);
1516
1517         rth->dst.input = ip_forward;
1518         rth->dst.output = ip_output;
1519
1520         rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1521         skb_dst_set(skb, &rth->dst);
1522 out:
1523         err = 0;
1524  cleanup:
1525         return err;
1526 }
1527
1528 static int ip_mkroute_input(struct sk_buff *skb,
1529                             struct fib_result *res,
1530                             const struct flowi4 *fl4,
1531                             struct in_device *in_dev,
1532                             __be32 daddr, __be32 saddr, u32 tos)
1533 {
1534 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1535         if (res->fi && res->fi->fib_nhs > 1)
1536                 fib_select_multipath(res);
1537 #endif
1538
1539         /* create a routing cache entry */
1540         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1541 }
1542
1543 /*
1544  *      NOTE. We drop all the packets that has local source
1545  *      addresses, because every properly looped back packet
1546  *      must have correct destination already attached by output routine.
1547  *
1548  *      Such approach solves two big problems:
1549  *      1. Not simplex devices are handled properly.
1550  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1551  *      called with rcu_read_lock()
1552  */
1553
1554 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1555                                u8 tos, struct net_device *dev)
1556 {
1557         struct fib_result res;
1558         struct in_device *in_dev = __in_dev_get_rcu(dev);
1559         struct flowi4   fl4;
1560         unsigned int    flags = 0;
1561         u32             itag = 0;
1562         struct rtable   *rth;
1563         int             err = -EINVAL;
1564         struct net    *net = dev_net(dev);
1565         bool do_cache;
1566
1567         /* IP on this device is disabled. */
1568
1569         if (!in_dev)
1570                 goto out;
1571
1572         /* Check for the most weird martians, which can be not detected
1573            by fib_lookup.
1574          */
1575
1576         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1577                 goto martian_source;
1578
1579         res.fi = NULL;
1580         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1581                 goto brd_input;
1582
1583         /* Accept zero addresses only to limited broadcast;
1584          * I even do not know to fix it or not. Waiting for complains :-)
1585          */
1586         if (ipv4_is_zeronet(saddr))
1587                 goto martian_source;
1588
1589         if (ipv4_is_zeronet(daddr))
1590                 goto martian_destination;
1591
1592         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
1593                 if (ipv4_is_loopback(daddr))
1594                         goto martian_destination;
1595
1596                 if (ipv4_is_loopback(saddr))
1597                         goto martian_source;
1598         }
1599
1600         /*
1601          *      Now we are ready to route packet.
1602          */
1603         fl4.flowi4_oif = 0;
1604         fl4.flowi4_iif = dev->ifindex;
1605         fl4.flowi4_mark = skb->mark;
1606         fl4.flowi4_tos = tos;
1607         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1608         fl4.daddr = daddr;
1609         fl4.saddr = saddr;
1610         err = fib_lookup(net, &fl4, &res);
1611         if (err != 0)
1612                 goto no_route;
1613
1614         RT_CACHE_STAT_INC(in_slow_tot);
1615
1616         if (res.type == RTN_BROADCAST)
1617                 goto brd_input;
1618
1619         if (res.type == RTN_LOCAL) {
1620                 err = fib_validate_source(skb, saddr, daddr, tos,
1621                                           net->loopback_dev->ifindex,
1622                                           dev, in_dev, &itag);
1623                 if (err < 0)
1624                         goto martian_source_keep_err;
1625                 goto local_input;
1626         }
1627
1628         if (!IN_DEV_FORWARD(in_dev))
1629                 goto no_route;
1630         if (res.type != RTN_UNICAST)
1631                 goto martian_destination;
1632
1633         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1634 out:    return err;
1635
1636 brd_input:
1637         if (skb->protocol != htons(ETH_P_IP))
1638                 goto e_inval;
1639
1640         if (!ipv4_is_zeronet(saddr)) {
1641                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1642                                           in_dev, &itag);
1643                 if (err < 0)
1644                         goto martian_source_keep_err;
1645         }
1646         flags |= RTCF_BROADCAST;
1647         res.type = RTN_BROADCAST;
1648         RT_CACHE_STAT_INC(in_brd);
1649
1650 local_input:
1651         do_cache = false;
1652         if (res.fi) {
1653                 if (!itag) {
1654                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1655                         if (rt_cache_valid(rth)) {
1656                                 skb_dst_set_noref(skb, &rth->dst);
1657                                 err = 0;
1658                                 goto out;
1659                         }
1660                         do_cache = true;
1661                 }
1662         }
1663
1664         rth = rt_dst_alloc(net->loopback_dev,
1665                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1666         if (!rth)
1667                 goto e_nobufs;
1668
1669         rth->dst.input= ip_local_deliver;
1670         rth->dst.output= ip_rt_bug;
1671 #ifdef CONFIG_IP_ROUTE_CLASSID
1672         rth->dst.tclassid = itag;
1673 #endif
1674
1675         rth->rt_genid = rt_genid(net);
1676         rth->rt_flags   = flags|RTCF_LOCAL;
1677         rth->rt_type    = res.type;
1678         rth->rt_is_input = 1;
1679         rth->rt_iif     = 0;
1680         rth->rt_pmtu    = 0;
1681         rth->rt_gateway = 0;
1682         INIT_LIST_HEAD(&rth->rt_uncached);
1683         if (res.type == RTN_UNREACHABLE) {
1684                 rth->dst.input= ip_error;
1685                 rth->dst.error= -err;
1686                 rth->rt_flags   &= ~RTCF_LOCAL;
1687         }
1688         if (do_cache)
1689                 rt_cache_route(&FIB_RES_NH(res), rth);
1690         skb_dst_set(skb, &rth->dst);
1691         err = 0;
1692         goto out;
1693
1694 no_route:
1695         RT_CACHE_STAT_INC(in_no_route);
1696         res.type = RTN_UNREACHABLE;
1697         if (err == -ESRCH)
1698                 err = -ENETUNREACH;
1699         goto local_input;
1700
1701         /*
1702          *      Do not cache martian addresses: they should be logged (RFC1812)
1703          */
1704 martian_destination:
1705         RT_CACHE_STAT_INC(in_martian_dst);
1706 #ifdef CONFIG_IP_ROUTE_VERBOSE
1707         if (IN_DEV_LOG_MARTIANS(in_dev))
1708                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1709                                      &daddr, &saddr, dev->name);
1710 #endif
1711
1712 e_inval:
1713         err = -EINVAL;
1714         goto out;
1715
1716 e_nobufs:
1717         err = -ENOBUFS;
1718         goto out;
1719
1720 martian_source:
1721         err = -EINVAL;
1722 martian_source_keep_err:
1723         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1724         goto out;
1725 }
1726
1727 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1728                          u8 tos, struct net_device *dev)
1729 {
1730         int res;
1731
1732         rcu_read_lock();
1733
1734         /* Multicast recognition logic is moved from route cache to here.
1735            The problem was that too many Ethernet cards have broken/missing
1736            hardware multicast filters :-( As result the host on multicasting
1737            network acquires a lot of useless route cache entries, sort of
1738            SDR messages from all the world. Now we try to get rid of them.
1739            Really, provided software IP multicast filter is organized
1740            reasonably (at least, hashed), it does not result in a slowdown
1741            comparing with route cache reject entries.
1742            Note, that multicast routers are not affected, because
1743            route cache entry is created eventually.
1744          */
1745         if (ipv4_is_multicast(daddr)) {
1746                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1747
1748                 if (in_dev) {
1749                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1750                                                   ip_hdr(skb)->protocol);
1751                         if (our
1752 #ifdef CONFIG_IP_MROUTE
1753                                 ||
1754                             (!ipv4_is_local_multicast(daddr) &&
1755                              IN_DEV_MFORWARD(in_dev))
1756 #endif
1757                            ) {
1758                                 int res = ip_route_input_mc(skb, daddr, saddr,
1759                                                             tos, dev, our);
1760                                 rcu_read_unlock();
1761                                 return res;
1762                         }
1763                 }
1764                 rcu_read_unlock();
1765                 return -EINVAL;
1766         }
1767         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1768         rcu_read_unlock();
1769         return res;
1770 }
1771 EXPORT_SYMBOL(ip_route_input_noref);
1772
1773 /* called with rcu_read_lock() */
1774 static struct rtable *__mkroute_output(const struct fib_result *res,
1775                                        const struct flowi4 *fl4, int orig_oif,
1776                                        struct net_device *dev_out,
1777                                        unsigned int flags)
1778 {
1779         struct fib_info *fi = res->fi;
1780         struct fib_nh_exception *fnhe;
1781         struct in_device *in_dev;
1782         u16 type = res->type;
1783         struct rtable *rth;
1784
1785         in_dev = __in_dev_get_rcu(dev_out);
1786         if (!in_dev)
1787                 return ERR_PTR(-EINVAL);
1788
1789         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1790                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1791                         return ERR_PTR(-EINVAL);
1792
1793         if (ipv4_is_lbcast(fl4->daddr))
1794                 type = RTN_BROADCAST;
1795         else if (ipv4_is_multicast(fl4->daddr))
1796                 type = RTN_MULTICAST;
1797         else if (ipv4_is_zeronet(fl4->daddr))
1798                 return ERR_PTR(-EINVAL);
1799
1800         if (dev_out->flags & IFF_LOOPBACK)
1801                 flags |= RTCF_LOCAL;
1802
1803         if (type == RTN_BROADCAST) {
1804                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1805                 fi = NULL;
1806         } else if (type == RTN_MULTICAST) {
1807                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1808                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1809                                      fl4->flowi4_proto))
1810                         flags &= ~RTCF_LOCAL;
1811                 /* If multicast route do not exist use
1812                  * default one, but do not gateway in this case.
1813                  * Yes, it is hack.
1814                  */
1815                 if (fi && res->prefixlen < 4)
1816                         fi = NULL;
1817         }
1818
1819         fnhe = NULL;
1820         if (fi) {
1821                 struct rtable __rcu **prth;
1822
1823                 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1824                 if (fnhe)
1825                         prth = &fnhe->fnhe_rth;
1826                 else
1827                         prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
1828                 rth = rcu_dereference(*prth);
1829                 if (rt_cache_valid(rth)) {
1830                         dst_hold(&rth->dst);
1831                         return rth;
1832                 }
1833         }
1834         rth = rt_dst_alloc(dev_out,
1835                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1836                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1837                            fi);
1838         if (!rth)
1839                 return ERR_PTR(-ENOBUFS);
1840
1841         rth->dst.output = ip_output;
1842
1843         rth->rt_genid = rt_genid(dev_net(dev_out));
1844         rth->rt_flags   = flags;
1845         rth->rt_type    = type;
1846         rth->rt_is_input = 0;
1847         rth->rt_iif     = orig_oif ? : 0;
1848         rth->rt_pmtu    = 0;
1849         rth->rt_gateway = 0;
1850         INIT_LIST_HEAD(&rth->rt_uncached);
1851
1852         RT_CACHE_STAT_INC(out_slow_tot);
1853
1854         if (flags & RTCF_LOCAL)
1855                 rth->dst.input = ip_local_deliver;
1856         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1857                 if (flags & RTCF_LOCAL &&
1858                     !(dev_out->flags & IFF_LOOPBACK)) {
1859                         rth->dst.output = ip_mc_output;
1860                         RT_CACHE_STAT_INC(out_slow_mc);
1861                 }
1862 #ifdef CONFIG_IP_MROUTE
1863                 if (type == RTN_MULTICAST) {
1864                         if (IN_DEV_MFORWARD(in_dev) &&
1865                             !ipv4_is_local_multicast(fl4->daddr)) {
1866                                 rth->dst.input = ip_mr_input;
1867                                 rth->dst.output = ip_mc_output;
1868                         }
1869                 }
1870 #endif
1871         }
1872
1873         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1874
1875         return rth;
1876 }
1877
1878 /*
1879  * Major route resolver routine.
1880  */
1881
1882 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1883 {
1884         struct net_device *dev_out = NULL;
1885         __u8 tos = RT_FL_TOS(fl4);
1886         unsigned int flags = 0;
1887         struct fib_result res;
1888         struct rtable *rth;
1889         int orig_oif;
1890
1891         res.tclassid    = 0;
1892         res.fi          = NULL;
1893         res.table       = NULL;
1894
1895         orig_oif = fl4->flowi4_oif;
1896
1897         fl4->flowi4_iif = net->loopback_dev->ifindex;
1898         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1899         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1900                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1901
1902         rcu_read_lock();
1903         if (fl4->saddr) {
1904                 rth = ERR_PTR(-EINVAL);
1905                 if (ipv4_is_multicast(fl4->saddr) ||
1906                     ipv4_is_lbcast(fl4->saddr) ||
1907                     ipv4_is_zeronet(fl4->saddr))
1908                         goto out;
1909
1910                 /* I removed check for oif == dev_out->oif here.
1911                    It was wrong for two reasons:
1912                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1913                       is assigned to multiple interfaces.
1914                    2. Moreover, we are allowed to send packets with saddr
1915                       of another iface. --ANK
1916                  */
1917
1918                 if (fl4->flowi4_oif == 0 &&
1919                     (ipv4_is_multicast(fl4->daddr) ||
1920                      ipv4_is_lbcast(fl4->daddr))) {
1921                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1922                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1923                         if (dev_out == NULL)
1924                                 goto out;
1925
1926                         /* Special hack: user can direct multicasts
1927                            and limited broadcast via necessary interface
1928                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1929                            This hack is not just for fun, it allows
1930                            vic,vat and friends to work.
1931                            They bind socket to loopback, set ttl to zero
1932                            and expect that it will work.
1933                            From the viewpoint of routing cache they are broken,
1934                            because we are not allowed to build multicast path
1935                            with loopback source addr (look, routing cache
1936                            cannot know, that ttl is zero, so that packet
1937                            will not leave this host and route is valid).
1938                            Luckily, this hack is good workaround.
1939                          */
1940
1941                         fl4->flowi4_oif = dev_out->ifindex;
1942                         goto make_route;
1943                 }
1944
1945                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1946                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1947                         if (!__ip_dev_find(net, fl4->saddr, false))
1948                                 goto out;
1949                 }
1950         }
1951
1952
1953         if (fl4->flowi4_oif) {
1954                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1955                 rth = ERR_PTR(-ENODEV);
1956                 if (dev_out == NULL)
1957                         goto out;
1958
1959                 /* RACE: Check return value of inet_select_addr instead. */
1960                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1961                         rth = ERR_PTR(-ENETUNREACH);
1962                         goto out;
1963                 }
1964                 if (ipv4_is_local_multicast(fl4->daddr) ||
1965                     ipv4_is_lbcast(fl4->daddr)) {
1966                         if (!fl4->saddr)
1967                                 fl4->saddr = inet_select_addr(dev_out, 0,
1968                                                               RT_SCOPE_LINK);
1969                         goto make_route;
1970                 }
1971                 if (fl4->saddr) {
1972                         if (ipv4_is_multicast(fl4->daddr))
1973                                 fl4->saddr = inet_select_addr(dev_out, 0,
1974                                                               fl4->flowi4_scope);
1975                         else if (!fl4->daddr)
1976                                 fl4->saddr = inet_select_addr(dev_out, 0,
1977                                                               RT_SCOPE_HOST);
1978                 }
1979         }
1980
1981         if (!fl4->daddr) {
1982                 fl4->daddr = fl4->saddr;
1983                 if (!fl4->daddr)
1984                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1985                 dev_out = net->loopback_dev;
1986                 fl4->flowi4_oif = net->loopback_dev->ifindex;
1987                 res.type = RTN_LOCAL;
1988                 flags |= RTCF_LOCAL;
1989                 goto make_route;
1990         }
1991
1992         if (fib_lookup(net, fl4, &res)) {
1993                 res.fi = NULL;
1994                 res.table = NULL;
1995                 if (fl4->flowi4_oif) {
1996                         /* Apparently, routing tables are wrong. Assume,
1997                            that the destination is on link.
1998
1999                            WHY? DW.
2000                            Because we are allowed to send to iface
2001                            even if it has NO routes and NO assigned
2002                            addresses. When oif is specified, routing
2003                            tables are looked up with only one purpose:
2004                            to catch if destination is gatewayed, rather than
2005                            direct. Moreover, if MSG_DONTROUTE is set,
2006                            we send packet, ignoring both routing tables
2007                            and ifaddr state. --ANK
2008
2009
2010                            We could make it even if oif is unknown,
2011                            likely IPv6, but we do not.
2012                          */
2013
2014                         if (fl4->saddr == 0)
2015                                 fl4->saddr = inet_select_addr(dev_out, 0,
2016                                                               RT_SCOPE_LINK);
2017                         res.type = RTN_UNICAST;
2018                         goto make_route;
2019                 }
2020                 rth = ERR_PTR(-ENETUNREACH);
2021                 goto out;
2022         }
2023
2024         if (res.type == RTN_LOCAL) {
2025                 if (!fl4->saddr) {
2026                         if (res.fi->fib_prefsrc)
2027                                 fl4->saddr = res.fi->fib_prefsrc;
2028                         else
2029                                 fl4->saddr = fl4->daddr;
2030                 }
2031                 dev_out = net->loopback_dev;
2032                 fl4->flowi4_oif = dev_out->ifindex;
2033                 flags |= RTCF_LOCAL;
2034                 goto make_route;
2035         }
2036
2037 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2038         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2039                 fib_select_multipath(&res);
2040         else
2041 #endif
2042         if (!res.prefixlen &&
2043             res.table->tb_num_default > 1 &&
2044             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2045                 fib_select_default(&res);
2046
2047         if (!fl4->saddr)
2048                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2049
2050         dev_out = FIB_RES_DEV(res);
2051         fl4->flowi4_oif = dev_out->ifindex;
2052
2053
2054 make_route:
2055         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2056
2057 out:
2058         rcu_read_unlock();
2059         return rth;
2060 }
2061 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2062
2063 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2064 {
2065         return NULL;
2066 }
2067
2068 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2069 {
2070         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2071
2072         return mtu ? : dst->dev->mtu;
2073 }
2074
2075 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2076                                           struct sk_buff *skb, u32 mtu)
2077 {
2078 }
2079
/* dst_ops->redirect for blackhole routes: intentionally does nothing. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
}
2084
2085 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2086                                           unsigned long old)
2087 {
2088         return NULL;
2089 }
2090
2091 static struct dst_ops ipv4_dst_blackhole_ops = {
2092         .family                 =       AF_INET,
2093         .protocol               =       cpu_to_be16(ETH_P_IP),
2094         .check                  =       ipv4_blackhole_dst_check,
2095         .mtu                    =       ipv4_blackhole_mtu,
2096         .default_advmss         =       ipv4_default_advmss,
2097         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2098         .redirect               =       ipv4_rt_blackhole_redirect,
2099         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2100         .neigh_lookup           =       ipv4_neigh_lookup,
2101 };
2102
2103 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2104 {
2105         struct rtable *ort = (struct rtable *) dst_orig;
2106         struct rtable *rt;
2107
2108         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2109         if (rt) {
2110                 struct dst_entry *new = &rt->dst;
2111
2112                 new->__use = 1;
2113                 new->input = dst_discard;
2114                 new->output = dst_discard;
2115
2116                 new->dev = ort->dst.dev;
2117                 if (new->dev)
2118                         dev_hold(new->dev);
2119
2120                 rt->rt_is_input = ort->rt_is_input;
2121                 rt->rt_iif = ort->rt_iif;
2122                 rt->rt_pmtu = ort->rt_pmtu;
2123
2124                 rt->rt_genid = rt_genid(net);
2125                 rt->rt_flags = ort->rt_flags;
2126                 rt->rt_type = ort->rt_type;
2127                 rt->rt_gateway = ort->rt_gateway;
2128
2129                 INIT_LIST_HEAD(&rt->rt_uncached);
2130
2131                 dst_free(new);
2132         }
2133
2134         dst_release(dst_orig);
2135
2136         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2137 }
2138
2139 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2140                                     struct sock *sk)
2141 {
2142         struct rtable *rt = __ip_route_output_key(net, flp4);
2143
2144         if (IS_ERR(rt))
2145                 return rt;
2146
2147         if (flp4->flowi4_proto)
2148                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2149                                                    flowi4_to_flowi(flp4),
2150                                                    sk, 0);
2151
2152         return rt;
2153 }
2154 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2155
2156 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2157                         struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2158                         u32 seq, int event, int nowait, unsigned int flags)
2159 {
2160         struct rtable *rt = skb_rtable(skb);
2161         struct rtmsg *r;
2162         struct nlmsghdr *nlh;
2163         unsigned long expires = 0;
2164         u32 error;
2165         u32 metrics[RTAX_MAX];
2166
2167         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2168         if (nlh == NULL)
2169                 return -EMSGSIZE;
2170
2171         r = nlmsg_data(nlh);
2172         r->rtm_family    = AF_INET;
2173         r->rtm_dst_len  = 32;
2174         r->rtm_src_len  = 0;
2175         r->rtm_tos      = fl4->flowi4_tos;
2176         r->rtm_table    = RT_TABLE_MAIN;
2177         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2178                 goto nla_put_failure;
2179         r->rtm_type     = rt->rt_type;
2180         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2181         r->rtm_protocol = RTPROT_UNSPEC;
2182         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2183         if (rt->rt_flags & RTCF_NOTIFY)
2184                 r->rtm_flags |= RTM_F_NOTIFY;
2185
2186         if (nla_put_be32(skb, RTA_DST, dst))
2187                 goto nla_put_failure;
2188         if (src) {
2189                 r->rtm_src_len = 32;
2190                 if (nla_put_be32(skb, RTA_SRC, src))
2191                         goto nla_put_failure;
2192         }
2193         if (rt->dst.dev &&
2194             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2195                 goto nla_put_failure;
2196 #ifdef CONFIG_IP_ROUTE_CLASSID
2197         if (rt->dst.tclassid &&
2198             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2199                 goto nla_put_failure;
2200 #endif
2201         if (!rt_is_input_route(rt) &&
2202             fl4->saddr != src) {
2203                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2204                         goto nla_put_failure;
2205         }
2206         if (rt->rt_gateway &&
2207             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2208                 goto nla_put_failure;
2209
2210         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2211         if (rt->rt_pmtu)
2212                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2213         if (rtnetlink_put_metrics(skb, metrics) < 0)
2214                 goto nla_put_failure;
2215
2216         if (fl4->flowi4_mark &&
2217             nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2218                 goto nla_put_failure;
2219
2220         error = rt->dst.error;
2221         expires = rt->dst.expires;
2222         if (expires) {
2223                 if (time_before(jiffies, expires))
2224                         expires -= jiffies;
2225                 else
2226                         expires = 0;
2227         }
2228
2229         if (rt_is_input_route(rt)) {
2230                 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2231                         goto nla_put_failure;
2232         }
2233
2234         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2235                 goto nla_put_failure;
2236
2237         return nlmsg_end(skb, nlh);
2238
2239 nla_put_failure:
2240         nlmsg_cancel(skb, nlh);
2241         return -EMSGSIZE;
2242 }
2243
2244 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2245 {
2246         struct net *net = sock_net(in_skb->sk);
2247         struct rtmsg *rtm;
2248         struct nlattr *tb[RTA_MAX+1];
2249         struct rtable *rt = NULL;
2250         struct flowi4 fl4;
2251         __be32 dst = 0;
2252         __be32 src = 0;
2253         u32 iif;
2254         int err;
2255         int mark;
2256         struct sk_buff *skb;
2257
2258         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2259         if (err < 0)
2260                 goto errout;
2261
2262         rtm = nlmsg_data(nlh);
2263
2264         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2265         if (skb == NULL) {
2266                 err = -ENOBUFS;
2267                 goto errout;
2268         }
2269
2270         /* Reserve room for dummy headers, this skb can pass
2271            through good chunk of routing engine.
2272          */
2273         skb_reset_mac_header(skb);
2274         skb_reset_network_header(skb);
2275
2276         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2277         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2278         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2279
2280         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2281         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2282         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2283         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2284
2285         memset(&fl4, 0, sizeof(fl4));
2286         fl4.daddr = dst;
2287         fl4.saddr = src;
2288         fl4.flowi4_tos = rtm->rtm_tos;
2289         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2290         fl4.flowi4_mark = mark;
2291
2292         if (iif) {
2293                 struct net_device *dev;
2294
2295                 dev = __dev_get_by_index(net, iif);
2296                 if (dev == NULL) {
2297                         err = -ENODEV;
2298                         goto errout_free;
2299                 }
2300
2301                 skb->protocol   = htons(ETH_P_IP);
2302                 skb->dev        = dev;
2303                 skb->mark       = mark;
2304                 local_bh_disable();
2305                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2306                 local_bh_enable();
2307
2308                 rt = skb_rtable(skb);
2309                 if (err == 0 && rt->dst.error)
2310                         err = -rt->dst.error;
2311         } else {
2312                 rt = ip_route_output_key(net, &fl4);
2313
2314                 err = 0;
2315                 if (IS_ERR(rt))
2316                         err = PTR_ERR(rt);
2317         }
2318
2319         if (err)
2320                 goto errout_free;
2321
2322         skb_dst_set(skb, &rt->dst);
2323         if (rtm->rtm_flags & RTM_F_NOTIFY)
2324                 rt->rt_flags |= RTCF_NOTIFY;
2325
2326         err = rt_fill_info(net, dst, src, &fl4, skb,
2327                            NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2328                            RTM_NEWROUTE, 0, 0);
2329         if (err <= 0)
2330                 goto errout_free;
2331
2332         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2333 errout:
2334         return err;
2335
2336 errout_free:
2337         kfree_skb(skb);
2338         goto errout;
2339 }
2340
2341 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2342 {
2343         return skb->len;
2344 }
2345
2346 void ip_rt_multicast_event(struct in_device *in_dev)
2347 {
2348         rt_cache_flush(dev_net(in_dev->dev), 0);
2349 }
2350
2351 #ifdef CONFIG_SYSCTL
2352 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2353                                         void __user *buffer,
2354                                         size_t *lenp, loff_t *ppos)
2355 {
2356         if (write) {
2357                 int flush_delay;
2358                 ctl_table ctl;
2359                 struct net *net;
2360
2361                 memcpy(&ctl, __ctl, sizeof(ctl));
2362                 ctl.data = &flush_delay;
2363                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
2364
2365                 net = (struct net *)__ctl->extra1;
2366                 rt_cache_flush(net, flush_delay);
2367                 return 0;
2368         }
2369
2370         return -EINVAL;
2371 }
2372
2373 static ctl_table ipv4_route_table[] = {
2374         {
2375                 .procname       = "gc_thresh",
2376                 .data           = &ipv4_dst_ops.gc_thresh,
2377                 .maxlen         = sizeof(int),
2378                 .mode           = 0644,
2379                 .proc_handler   = proc_dointvec,
2380         },
2381         {
2382                 .procname       = "max_size",
2383                 .data           = &ip_rt_max_size,
2384                 .maxlen         = sizeof(int),
2385                 .mode           = 0644,
2386                 .proc_handler   = proc_dointvec,
2387         },
2388         {
2389                 /*  Deprecated. Use gc_min_interval_ms */
2390
2391                 .procname       = "gc_min_interval",
2392                 .data           = &ip_rt_gc_min_interval,
2393                 .maxlen         = sizeof(int),
2394                 .mode           = 0644,
2395                 .proc_handler   = proc_dointvec_jiffies,
2396         },
2397         {
2398                 .procname       = "gc_min_interval_ms",
2399                 .data           = &ip_rt_gc_min_interval,
2400                 .maxlen         = sizeof(int),
2401                 .mode           = 0644,
2402                 .proc_handler   = proc_dointvec_ms_jiffies,
2403         },
2404         {
2405                 .procname       = "gc_timeout",
2406                 .data           = &ip_rt_gc_timeout,
2407                 .maxlen         = sizeof(int),
2408                 .mode           = 0644,
2409                 .proc_handler   = proc_dointvec_jiffies,
2410         },
2411         {
2412                 .procname       = "gc_interval",
2413                 .data           = &ip_rt_gc_interval,
2414                 .maxlen         = sizeof(int),
2415                 .mode           = 0644,
2416                 .proc_handler   = proc_dointvec_jiffies,
2417         },
2418         {
2419                 .procname       = "redirect_load",
2420                 .data           = &ip_rt_redirect_load,
2421                 .maxlen         = sizeof(int),
2422                 .mode           = 0644,
2423                 .proc_handler   = proc_dointvec,
2424         },
2425         {
2426                 .procname       = "redirect_number",
2427                 .data           = &ip_rt_redirect_number,
2428                 .maxlen         = sizeof(int),
2429                 .mode           = 0644,
2430                 .proc_handler   = proc_dointvec,
2431         },
2432         {
2433                 .procname       = "redirect_silence",
2434                 .data           = &ip_rt_redirect_silence,
2435                 .maxlen         = sizeof(int),
2436                 .mode           = 0644,
2437                 .proc_handler   = proc_dointvec,
2438         },
2439         {
2440                 .procname       = "error_cost",
2441                 .data           = &ip_rt_error_cost,
2442                 .maxlen         = sizeof(int),
2443                 .mode           = 0644,
2444                 .proc_handler   = proc_dointvec,
2445         },
2446         {
2447                 .procname       = "error_burst",
2448                 .data           = &ip_rt_error_burst,
2449                 .maxlen         = sizeof(int),
2450                 .mode           = 0644,
2451                 .proc_handler   = proc_dointvec,
2452         },
2453         {
2454                 .procname       = "gc_elasticity",
2455                 .data           = &ip_rt_gc_elasticity,
2456                 .maxlen         = sizeof(int),
2457                 .mode           = 0644,
2458                 .proc_handler   = proc_dointvec,
2459         },
2460         {
2461                 .procname       = "mtu_expires",
2462                 .data           = &ip_rt_mtu_expires,
2463                 .maxlen         = sizeof(int),
2464                 .mode           = 0644,
2465                 .proc_handler   = proc_dointvec_jiffies,
2466         },
2467         {
2468                 .procname       = "min_pmtu",
2469                 .data           = &ip_rt_min_pmtu,
2470                 .maxlen         = sizeof(int),
2471                 .mode           = 0644,
2472                 .proc_handler   = proc_dointvec,
2473         },
2474         {
2475                 .procname       = "min_adv_mss",
2476                 .data           = &ip_rt_min_advmss,
2477                 .maxlen         = sizeof(int),
2478                 .mode           = 0644,
2479                 .proc_handler   = proc_dointvec,
2480         },
2481         { }
2482 };
2483
2484 static struct ctl_table ipv4_route_flush_table[] = {
2485         {
2486                 .procname       = "flush",
2487                 .maxlen         = sizeof(int),
2488                 .mode           = 0200,
2489                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2490         },
2491         { },
2492 };
2493
2494 static __net_init int sysctl_route_net_init(struct net *net)
2495 {
2496         struct ctl_table *tbl;
2497
2498         tbl = ipv4_route_flush_table;
2499         if (!net_eq(net, &init_net)) {
2500                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2501                 if (tbl == NULL)
2502                         goto err_dup;
2503         }
2504         tbl[0].extra1 = net;
2505
2506         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2507         if (net->ipv4.route_hdr == NULL)
2508                 goto err_reg;
2509         return 0;
2510
2511 err_reg:
2512         if (tbl != ipv4_route_flush_table)
2513                 kfree(tbl);
2514 err_dup:
2515         return -ENOMEM;
2516 }
2517
2518 static __net_exit void sysctl_route_net_exit(struct net *net)
2519 {
             /*
              * Per-namespace teardown: unregister the route sysctl header and
              * free the table that was registered with it.
              *
              * The table pointer is fetched from the header before the header
              * is unregistered (presumably because the header must not be
              * dereferenced afterwards - confirm).
              */
2520         struct ctl_table *tbl;
2521
2522         tbl = net->ipv4.route_hdr->ctl_table_arg;
2523         unregister_net_sysctl_table(net->ipv4.route_hdr);
             /*
              * sysctl_route_net_init() uses the static template only for
              * init_net.  Passing it to kfree() would be invalid, so seeing it
              * here is treated as a fatal bug.
              */
2524         BUG_ON(tbl == ipv4_route_flush_table);
2525         kfree(tbl);
2526 }
2527
2528 static __net_initdata struct pernet_operations sysctl_route_ops = {
             /*
              * Per-namespace hooks for the route "flush" sysctl; registered
              * from ip_rt_init() when CONFIG_SYSCTL is enabled.
              */
2529         .init = sysctl_route_net_init,
2530         .exit = sysctl_route_net_exit,
2531 };
2532 #endif
2533
2534 static __net_init int rt_genid_init(struct net *net)
2535 {
             /*
              * Per-namespace init: fill net->ipv4.rt_genid and
              * net->ipv4.dev_addr_genid with random bytes, so each namespace
              * starts from an unpredictable value.  Always returns 0.
              */
2536         get_random_bytes(&net->ipv4.rt_genid,
2537                          sizeof(net->ipv4.rt_genid));
2538         get_random_bytes(&net->ipv4.dev_addr_genid,
2539                          sizeof(net->ipv4.dev_addr_genid));
2540         return 0;
2541 }
2542
2543 static __net_initdata struct pernet_operations rt_genid_ops = {
             /*
              * Only an init hook is set; rt_genid_init() allocates nothing, so
              * no exit hook is provided.  Registered from ip_rt_init().
              */
2544         .init = rt_genid_init,
2545 };
2546
2547 static int __net_init ipv4_inetpeer_init(struct net *net)
2548 {
             /*
              * Per-namespace init: allocate a struct inet_peer_base, initialise
              * it with inet_peer_base_init() and publish it as net->ipv4.peers.
              *
              * Returns 0 on success or -ENOMEM if the allocation fails.  The
              * base is released again by ipv4_inetpeer_exit().
              */
2549         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2550
2551         if (!bp)
2552                 return -ENOMEM;
2553         inet_peer_base_init(bp);
2554         net->ipv4.peers = bp;
2555         return 0;
2556 }
2557
2558 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2559 {
             /*
              * Per-namespace teardown of the inet peer base installed by
              * ipv4_inetpeer_init().  The namespace's pointer is cleared first,
              * then the tree is invalidated and the base itself is freed.
              */
2560         struct inet_peer_base *bp = net->ipv4.peers;
2561
2562         net->ipv4.peers = NULL;
2563         inetpeer_invalidate_tree(bp);
2564         kfree(bp);
2565 }
2566
2567 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
             /*
              * Per-namespace hooks managing net->ipv4.peers; registered from
              * ip_rt_init().
              */
2568         .init   =       ipv4_inetpeer_init,
2569         .exit   =       ipv4_inetpeer_exit,
2570 };
2571
2572 #ifdef CONFIG_IP_ROUTE_CLASSID
     /*
      * Per-CPU array of 256 struct ip_rt_acct entries, allocated in
      * ip_rt_init().  Only built when CONFIG_IP_ROUTE_CLASSID is enabled.
      */
2573 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2574 #endif /* CONFIG_IP_ROUTE_CLASSID */
2575
2576 int __init ip_rt_init(void)
2577 {
             /*
              * One-time initialisation of the IPv4 routing layer.
              *
              * Allocation failures for the accounting array and the dst entry
              * counters panic.  A failure to create the proc files is only
              * logged.  The results of the register_pernet_subsys() calls are
              * not checked, and rc is never reassigned, so this function
              * always returns 0.
              */
2578         int rc = 0;
2579
2580 #ifdef CONFIG_IP_ROUTE_CLASSID
             /* 256 struct ip_rt_acct slots per CPU. */
2581         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2582         if (!ip_rt_acct)
2583                 panic("IP: failed to allocate ip_rt_acct\n");
2584 #endif
2585
             /*
              * Slab cache for struct rtable.  SLAB_PANIC is passed and the
              * result is not NULL-checked here.
              */
2586         ipv4_dst_ops.kmem_cachep =
2587                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2588                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2589
             /* Blackhole dst entries share the same slab cache. */
2590         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2591
2592         if (dst_entries_init(&ipv4_dst_ops) < 0)
2593                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2594
2595         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2596                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2597
             /* Both limits are set to their maximum representable values. */
2598         ipv4_dst_ops.gc_thresh = ~0;
2599         ip_rt_max_size = INT_MAX;
2600
2601         devinet_init();
2602         ip_fib_init();
2603
             /* Missing proc files are not fatal: log and carry on. */
2604         if (ip_rt_proc_init())
2605                 pr_err("Unable to create route proc files\n");
2606 #ifdef CONFIG_XFRM
2607         xfrm_init();
2608         xfrm4_init(ip_rt_max_size);
2609 #endif
2610         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2611
             /* Per-namespace state; return values are ignored. */
2612 #ifdef CONFIG_SYSCTL
2613         register_pernet_subsys(&sysctl_route_ops);
2614 #endif
2615         register_pernet_subsys(&rt_genid_ops);
2616         register_pernet_subsys(&ipv4_inetpeer_ops);
2617         return rc;
2618 }
2619
2620 #ifdef CONFIG_SYSCTL
2621 /*
2622  * We really need to sanitize the damn ipv4 init order, then all
2623  * this nonsense will go away.
2624  */
     /*
      * Registers ipv4_route_table under "net/ipv4/route" for init_net only.
      * The header returned by register_net_sysctl() is discarded, so a
      * failed registration goes unnoticed here.
      */
2625 void __init ip_static_sysctl_init(void)
2626 {
2627         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2628 }
2629 #endif