net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
 112
/* Mask a flow's TOS field down to the IPTOS_RT_MASK bits plus RTO_ONLINK. */
#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* NOTE(review): presumably an MTU ceiling applied further down - confirm at the use sites. */
#define IP_MAX_MTU      0xFFF0

/* 300 seconds expressed in jiffies. NOTE(review): consumer is not visible in the part reviewed. */
#define RT_GC_TIMEOUT (300*HZ)

/*
 * File-wide tunables.  In the part of the file reviewed here only the
 * three ip_rt_redirect_* values are read (by ip_rt_send_redirect());
 * the others are presumably consumed further down - verify there.
 */
static int ip_rt_max_size;
/* Once a peer has been sent this many redirects, no further ones are generated. */
static int ip_rt_redirect_number __read_mostly  = 9;
/* Base back-off between redirects; shifted left by the per-peer redirect count. */
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
/* Quiet period after which the per-peer redirect count is reset to zero. */
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;
 129
 130 /*
 131  *      Interface to generic destination cache.
 132  */
 133
/*
 * Forward declarations of the callbacks that are plugged into
 * ipv4_dst_ops before their definitions appear.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);
 144
/*
 * ->ifdown hook installed in ipv4_dst_ops.  The body is empty: nothing
 * is done with @dst, @dev or @how.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
}
 149
/*
 * ->cow_metrics hook installed in ipv4_dst_ops.  It warns unconditionally
 * and hands back NULL; both arguments are ignored.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}
 155
/* Defined further down; declared here so ipv4_dst_ops can reference it. */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

/*
 * Operations table binding the IPv4 routing code to the generic
 * destination cache: AF_INET / ETH_P_IP plus the callbacks declared above.
 */
static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};
 176
 177 #define ECN_OR_COST(class)      TC_PRIO_##class
 178
 179 const __u8 ip_tos2prio[16] = {
 180         TC_PRIO_BESTEFFORT,
 181         ECN_OR_COST(BESTEFFORT),
 182         TC_PRIO_BESTEFFORT,
 183         ECN_OR_COST(BESTEFFORT),
 184         TC_PRIO_BULK,
 185         ECN_OR_COST(BULK),
 186         TC_PRIO_BULK,
 187         ECN_OR_COST(BULK),
 188         TC_PRIO_INTERACTIVE,
 189         ECN_OR_COST(INTERACTIVE),
 190         TC_PRIO_INTERACTIVE,
 191         ECN_OR_COST(INTERACTIVE),
 192         TC_PRIO_INTERACTIVE_BULK,
 193         ECN_OR_COST(INTERACTIVE_BULK),
 194         TC_PRIO_INTERACTIVE_BULK,
 195         ECN_OR_COST(INTERACTIVE_BULK)
 196 };
 197 EXPORT_SYMBOL(ip_tos2prio);
 198
/* Per-CPU routing statistics; dumped by rt_cpu_seq_show() when CONFIG_PROC_FS is set. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
/* Bump one field of the current CPU's rt_cache_stat via __this_cpu_inc(). */
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 201
 202 #ifdef CONFIG_PROC_FS
 203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 204 {
 205         if (*pos)
 206                 return NULL;
 207         return SEQ_START_TOKEN;
 208 }
 209
 210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 211 {
 212         ++*pos;
 213         return NULL;
 214 }
 215
/* seq_file ->stop for the "rt_cache" file: empty, ->start acquires nothing. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
 219
 220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 221 {
 222         if (v == SEQ_START_TOKEN)
 223                 seq_printf(seq, "%-127s\n",
 224                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 225                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 226                            "HHUptod\tSpecDst");
 227         return 0;
 228 }
 229
/* Iterator callbacks used by rt_cache_seq_open(). */
static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};
 236
/* ->open handler: attach the rt_cache_seq_ops iterator to @file; returns seq_open()'s result. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}
 241
/* File operations registered for "rt_cache" under net->proc_net by ip_rt_do_proc_init(). */
static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
 249
 250
 251 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 252 {
 253         int cpu;
 254
 255         if (*pos == 0)
 256                 return SEQ_START_TOKEN;
 257
 258         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 259                 if (!cpu_possible(cpu))
 260                         continue;
 261                 *pos = cpu+1;
 262                 return &per_cpu(rt_cache_stat, cpu);
 263         }
 264         return NULL;
 265 }
 266
 267 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 268 {
 269         int cpu;
 270
 271         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 272                 if (!cpu_possible(cpu))
 273                         continue;
 274                 *pos = cpu+1;
 275                 return &per_cpu(rt_cache_stat, cpu);
 276         }
 277         return NULL;
 278
 279 }
 280
/* seq_file ->stop for the per-CPU statistics file: empty, nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
 285
 286 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 287 {
 288         struct rt_cache_stat *st = v;
 289
 290         if (v == SEQ_START_TOKEN) {
 291                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 292                 return 0;
 293         }
 294
 295         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 296                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 297                    dst_entries_get_slow(&ipv4_dst_ops),
 298                    st->in_hit,
 299                    st->in_slow_tot,
 300                    st->in_slow_mc,
 301                    st->in_no_route,
 302                    st->in_brd,
 303                    st->in_martian_dst,
 304                    st->in_martian_src,
 305
 306                    st->out_hit,
 307                    st->out_slow_tot,
 308                    st->out_slow_mc,
 309
 310                    st->gc_total,
 311                    st->gc_ignored,
 312                    st->gc_goal_miss,
 313                    st->gc_dst_overflow,
 314                    st->in_hlist_search,
 315                    st->out_hlist_search
 316                 );
 317         return 0;
 318 }
 319
/* Iterator callbacks used by rt_cpu_seq_open(). */
static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};
 326
 327
/* ->open handler: attach the rt_cpu_seq_ops iterator to @file; returns seq_open()'s result. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}
 332
/* File operations registered for "rt_cache" under net->proc_net_stat by ip_rt_do_proc_init(). */
static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
 340
 341 #ifdef CONFIG_IP_ROUTE_CLASSID
 342 static int rt_acct_proc_show(struct seq_file *m, void *v)
 343 {
 344         struct ip_rt_acct *dst, *src;
 345         unsigned int i, j;
 346
 347         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 348         if (!dst)
 349                 return -ENOMEM;
 350
 351         for_each_possible_cpu(i) {
 352                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 353                 for (j = 0; j < 256; j++) {
 354                         dst[j].o_bytes   += src[j].o_bytes;
 355                         dst[j].o_packets += src[j].o_packets;
 356                         dst[j].i_bytes   += src[j].i_bytes;
 357                         dst[j].i_packets += src[j].i_packets;
 358                 }
 359         }
 360
 361         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 362         kfree(dst);
 363         return 0;
 364 }
 365
/* ->open handler: single_open() with rt_acct_proc_show and no private data. */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}
 370
/* File operations registered for "rt_acct" under net->proc_net by ip_rt_do_proc_init(). */
static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
 378 #endif
 379
 380 static int __net_init ip_rt_do_proc_init(struct net *net)
 381 {
 382         struct proc_dir_entry *pde;
 383
 384         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 385                           &rt_cache_seq_fops);
 386         if (!pde)
 387                 goto err1;
 388
 389         pde = proc_create("rt_cache", S_IRUGO,
 390                           net->proc_net_stat, &rt_cpu_seq_fops);
 391         if (!pde)
 392                 goto err2;
 393
 394 #ifdef CONFIG_IP_ROUTE_CLASSID
 395         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 396         if (!pde)
 397                 goto err3;
 398 #endif
 399         return 0;
 400
 401 #ifdef CONFIG_IP_ROUTE_CLASSID
 402 err3:
 403         remove_proc_entry("rt_cache", net->proc_net_stat);
 404 #endif
 405 err2:
 406         remove_proc_entry("rt_cache", net->proc_net);
 407 err1:
 408         return -ENOMEM;
 409 }
 410
/* Per-namespace exit: remove the proc entries that ip_rt_do_proc_init() creates for @net. */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}
 419
/* Per-namespace hooks for the proc entries; registered by ip_rt_proc_init(). */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};
 424
/* Register ip_rt_proc_ops as a pernet subsystem; returns register_pernet_subsys()'s result. */
static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}
 429
 430 #else
/* Variant used when CONFIG_PROC_FS is not set: nothing to register, report success. */
static inline int ip_rt_proc_init(void)
{
        return 0;
}
 435 #endif /* CONFIG_PROC_FS */
 436
 437 static inline bool rt_is_expired(const struct rtable *rth)
 438 {
 439         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 440 }
 441
/*
 * Bump the generation id of @net.  Routes that still carry the previous
 * id are then reported as expired by rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
        rt_genid_bump(net);
}
 446
 447 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 448                                            struct sk_buff *skb,
 449                                            const void *daddr)
 450 {
 451         struct net_device *dev = dst->dev;
 452         const __be32 *pkey = daddr;
 453         const struct rtable *rt;
 454         struct neighbour *n;
 455
 456         rt = (const struct rtable *) dst;
 457         if (rt->rt_gateway)
 458                 pkey = (const __be32 *) &rt->rt_gateway;
 459         else if (skb)
 460                 pkey = &ip_hdr(skb)->daddr;
 461
 462         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 463         if (n)
 464                 return n;
 465         return neigh_create(&arp_tbl, pkey, dev);
 466 }
 467
/* Number of buckets; ip_idents_reserve() reduces the hash modulo this value. */
#define IP_IDENTS_SZ 2048u
/* One identifier generator. */
struct ip_ident_bucket {
        atomic_t        id;             /* running counter advanced by ip_idents_reserve() */
        u32             stamp32;        /* (u32)jiffies recorded by the last winning cmpxchg */
};

/* Bucket array. NOTE(review): its allocation is not visible in the part reviewed. */
static struct ip_ident_bucket *ip_idents __read_mostly;
 475
 476 /* In order to protect privacy, we add a perturbation to identifiers
 477  * if one generator is seldom used. This makes hard for an attacker
 478  * to infer how many packets were sent between two points in time.
 479  */
 480 u32 ip_idents_reserve(u32 hash, int segs)
 481 {
 482         struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
 483         u32 old = ACCESS_ONCE(bucket->stamp32);
 484         u32 now = (u32)jiffies;
 485         u32 delta = 0;
 486
 487         if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
 488                 u64 x = prandom_u32();
 489
 490                 x *= (now - old);
 491                 delta = (u32)(x >> 32);
 492         }
 493
 494         return atomic_add_return(segs + delta, &bucket->id) - segs;
 495 }
 496 EXPORT_SYMBOL(ip_idents_reserve);
 497
/*
 * Fill in the IP identification field of @iph.
 *
 * The destination address, source address and protocol of @iph are
 * hashed with a random seed, ip_idents_reserve() is asked for @segs
 * identifiers from the matching bucket, and the value it returns is
 * stored in iph->id in network byte order.
 */
void __ip_select_ident(struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        static bool hashrnd_initialized = false;
        u32 hash, id;

        /*
         * Lazy one-time seeding of the hash.
         * NOTE(review): there is no locking here and the flag is set
         * before the seed is filled in, so a concurrent first caller may
         * hash with a not-yet-initialised seed - confirm this is accepted.
         */
        if (unlikely(!hashrnd_initialized)) {
                hashrnd_initialized = true;
                get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
        }

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
 517
 518 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 519                              const struct iphdr *iph,
 520                              int oif, u8 tos,
 521                              u8 prot, u32 mark, int flow_flags)
 522 {
 523         if (sk) {
 524                 const struct inet_sock *inet = inet_sk(sk);
 525
 526                 oif = sk->sk_bound_dev_if;
 527                 mark = sk->sk_mark;
 528                 tos = RT_CONN_FLAGS(sk);
 529                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 530         }
 531         flowi4_init_output(fl4, oif, mark, tos,
 532                            RT_SCOPE_UNIVERSE, prot,
 533                            flow_flags,
 534                            iph->daddr, iph->saddr, 0, 0);
 535 }
 536
 537 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 538                                const struct sock *sk)
 539 {
 540         const struct iphdr *iph = ip_hdr(skb);
 541         int oif = skb->dev->ifindex;
 542         u8 tos = RT_TOS(iph->tos);
 543         u8 prot = iph->protocol;
 544         u32 mark = skb->mark;
 545
 546         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 547 }
 548
 549 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 550 {
 551         const struct inet_sock *inet = inet_sk(sk);
 552         const struct ip_options_rcu *inet_opt;
 553         __be32 daddr = inet->inet_daddr;
 554
 555         rcu_read_lock();
 556         inet_opt = rcu_dereference(inet->inet_opt);
 557         if (inet_opt && inet_opt->opt.srr)
 558                 daddr = inet_opt->opt.faddr;
 559         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 560                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 561                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 562                            inet_sk_flowi_flags(sk),
 563                            daddr, inet->inet_saddr, 0, 0);
 564         rcu_read_unlock();
 565 }
 566
 567 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 568                                  const struct sk_buff *skb)
 569 {
 570         if (skb)
 571                 build_skb_flow_key(fl4, skb, sk);
 572         else
 573                 build_sk_flow_key(fl4, sk);
 574 }
 575
/* Hand @rt to call_rcu() with dst_rcu_free as the callback. */
static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}
 580
/* Taken (with BHs disabled) by update_or_create_fnhe() around all exception-table updates. */
static DEFINE_SPINLOCK(fnhe_lock);
 582
 583 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 584 {
 585         struct fib_nh_exception *fnhe, *oldest;
 586         struct rtable *orig;
 587
 588         oldest = rcu_dereference(hash->chain);
 589         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 590              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 591                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 592                         oldest = fnhe;
 593         }
 594         orig = rcu_dereference(oldest->fnhe_rth);
 595         if (orig) {
 596                 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
 597                 rt_free(orig);
 598         }
 599         return oldest;
 600 }
 601
 602 static inline u32 fnhe_hashfun(__be32 daddr)
 603 {
 604         u32 hval;
 605
 606         hval = (__force u32) daddr;
 607         hval ^= (hval >> 11) ^ (hval >> 22);
 608
 609         return hval & (FNHE_HASH_SIZE - 1);
 610 }
 611
 612 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 613                                   u32 pmtu, unsigned long expires)
 614 {
 615         struct fnhe_hash_bucket *hash;
 616         struct fib_nh_exception *fnhe;
 617         int depth;
 618         u32 hval = fnhe_hashfun(daddr);
 619
 620         spin_lock_bh(&fnhe_lock);
 621
 622         hash = nh->nh_exceptions;
 623         if (!hash) {
 624                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 625                 if (!hash)
 626                         goto out_unlock;
 627                 nh->nh_exceptions = hash;
 628         }
 629
 630         hash += hval;
 631
 632         depth = 0;
 633         for (fnhe = rcu_dereference(hash->chain); fnhe;
 634              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 635                 if (fnhe->fnhe_daddr == daddr)
 636                         break;
 637                 depth++;
 638         }
 639
 640         if (fnhe) {
 641                 if (gw)
 642                         fnhe->fnhe_gw = gw;
 643                 if (pmtu) {
 644                         fnhe->fnhe_pmtu = pmtu;
 645                         fnhe->fnhe_expires = expires;
 646                 }
 647         } else {
 648                 if (depth > FNHE_RECLAIM_DEPTH)
 649                         fnhe = fnhe_oldest(hash);
 650                 else {
 651                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 652                         if (!fnhe)
 653                                 goto out_unlock;
 654
 655                         fnhe->fnhe_next = hash->chain;
 656                         rcu_assign_pointer(hash->chain, fnhe);
 657                 }
 658                 fnhe->fnhe_daddr = daddr;
 659                 fnhe->fnhe_gw = gw;
 660                 fnhe->fnhe_pmtu = pmtu;
 661                 fnhe->fnhe_expires = expires;
 662         }
 663
 664         fnhe->fnhe_stamp = jiffies;
 665
 666 out_unlock:
 667         spin_unlock_bh(&fnhe_lock);
 668         return;
 669 }
 670
 671 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 672                              bool kill_route)
 673 {
 674         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 675         __be32 old_gw = ip_hdr(skb)->saddr;
 676         struct net_device *dev = skb->dev;
 677         struct in_device *in_dev;
 678         struct fib_result res;
 679         struct neighbour *n;
 680         struct net *net;
 681
 682         switch (icmp_hdr(skb)->code & 7) {
 683         case ICMP_REDIR_NET:
 684         case ICMP_REDIR_NETTOS:
 685         case ICMP_REDIR_HOST:
 686         case ICMP_REDIR_HOSTTOS:
 687                 break;
 688
 689         default:
 690                 return;
 691         }
 692
 693         if (rt->rt_gateway != old_gw)
 694                 return;
 695
 696         in_dev = __in_dev_get_rcu(dev);
 697         if (!in_dev)
 698                 return;
 699
 700         net = dev_net(dev);
 701         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 702             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 703             ipv4_is_zeronet(new_gw))
 704                 goto reject_redirect;
 705
 706         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 707                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 708                         goto reject_redirect;
 709                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 710                         goto reject_redirect;
 711         } else {
 712                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 713                         goto reject_redirect;
 714         }
 715
 716         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
 717         if (n) {
 718                 if (!(n->nud_state & NUD_VALID)) {
 719                         neigh_event_send(n, NULL);
 720                 } else {
 721                         if (fib_lookup(net, fl4, &res) == 0) {
 722                                 struct fib_nh *nh = &FIB_RES_NH(res);
 723
 724                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 725                                                       0, 0);
 726                         }
 727                         if (kill_route)
 728                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 729                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 730                 }
 731                 neigh_release(n);
 732         }
 733         return;
 734
 735 reject_redirect:
 736 #ifdef CONFIG_IP_ROUTE_VERBOSE
 737         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 738                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 739                 __be32 daddr = iph->daddr;
 740                 __be32 saddr = iph->saddr;
 741
 742                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 743                                      "  Advised path = %pI4 -> %pI4\n",
 744                                      &old_gw, dev->name, &new_gw,
 745                                      &saddr, &daddr);
 746         }
 747 #endif
 748         ;
 749 }
 750
 751 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 752 {
 753         struct rtable *rt;
 754         struct flowi4 fl4;
 755         const struct iphdr *iph = (const struct iphdr *) skb->data;
 756         int oif = skb->dev->ifindex;
 757         u8 tos = RT_TOS(iph->tos);
 758         u8 prot = iph->protocol;
 759         u32 mark = skb->mark;
 760
 761         rt = (struct rtable *) dst;
 762
 763         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
 764         __ip_do_redirect(rt, skb, &fl4, true);
 765 }
 766
 767 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 768 {
 769         struct rtable *rt = (struct rtable *)dst;
 770         struct dst_entry *ret = dst;
 771
 772         if (rt) {
 773                 if (dst->obsolete > 0) {
 774                         ip_rt_put(rt);
 775                         ret = NULL;
 776                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 777                            rt->dst.expires) {
 778                         ip_rt_put(rt);
 779                         ret = NULL;
 780                 }
 781         }
 782         return ret;
 783 }
 784
 785 /*
 786  * Algorithm:
 787  *      1. The first ip_rt_redirect_number redirects are sent
 788  *         with exponential backoff, then we stop sending them at all,
 789  *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 793  *
 794  * This algorithm is much cheaper and more intelligent than dumb load limiting
 795  * in icmp.c.
 796  *
 797  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 798  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 799  */
 800
 801 void ip_rt_send_redirect(struct sk_buff *skb)
 802 {
 803         struct rtable *rt = skb_rtable(skb);
 804         struct in_device *in_dev;
 805         struct inet_peer *peer;
 806         struct net *net;
 807         int log_martians;
 808
 809         rcu_read_lock();
 810         in_dev = __in_dev_get_rcu(rt->dst.dev);
 811         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 812                 rcu_read_unlock();
 813                 return;
 814         }
 815         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 816         rcu_read_unlock();
 817
 818         net = dev_net(rt->dst.dev);
 819         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 820         if (!peer) {
 821                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 822                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 823                 return;
 824         }
 825
 826         /* No redirected packets during ip_rt_redirect_silence;
 827          * reset the algorithm.
 828          */
 829         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 830                 peer->rate_tokens = 0;
 831
 832         /* Too many ignored redirects; do not send anything
 833          * set dst.rate_last to the last seen redirected packet.
 834          */
 835         if (peer->rate_tokens >= ip_rt_redirect_number) {
 836                 peer->rate_last = jiffies;
 837                 goto out_put_peer;
 838         }
 839
 840         /* Check for load limit; set rate_last to the latest sent
 841          * redirect.
 842          */
 843         if (peer->rate_tokens == 0 ||
 844             time_after(jiffies,
 845                        (peer->rate_last +
 846                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 847                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 848
 849                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 850                 peer->rate_last = jiffies;
 851                 ++peer->rate_tokens;
 852 #ifdef CONFIG_IP_ROUTE_VERBOSE
 853                 if (log_martians &&
 854                     peer->rate_tokens == ip_rt_redirect_number)
 855                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 856                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 857                                              &ip_hdr(skb)->daddr, &gw);
 858 #endif
 859         }
 860 out_put_peer:
 861         inet_putpeer(peer);
 862 }
 863
 864 static int ip_error(struct sk_buff *skb)
 865 {
 866         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 867         struct rtable *rt = skb_rtable(skb);
 868         struct inet_peer *peer;
 869         unsigned long now;
 870         struct net *net;
 871         bool send;
 872         int code;
 873
 874         net = dev_net(rt->dst.dev);
 875         if (!IN_DEV_FORWARD(in_dev)) {
 876                 switch (rt->dst.error) {
 877                 case EHOSTUNREACH:
 878                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
 879                         break;
 880
 881                 case ENETUNREACH:
 882                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 883                         break;
 884                 }
 885                 goto out;
 886         }
 887
 888         switch (rt->dst.error) {
 889         case EINVAL:
 890         default:
 891                 goto out;
 892         case EHOSTUNREACH:
 893                 code = ICMP_HOST_UNREACH;
 894                 break;
 895         case ENETUNREACH:
 896                 code = ICMP_NET_UNREACH;
 897                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 898                 break;
 899         case EACCES:
 900                 code = ICMP_PKT_FILTERED;
 901                 break;
 902         }
 903
 904         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 905
 906         send = true;
 907         if (peer) {
 908                 now = jiffies;
 909                 peer->rate_tokens += now - peer->rate_last;
 910                 if (peer->rate_tokens > ip_rt_error_burst)
 911                         peer->rate_tokens = ip_rt_error_burst;
 912                 peer->rate_last = now;
 913                 if (peer->rate_tokens >= ip_rt_error_cost)
 914                         peer->rate_tokens -= ip_rt_error_cost;
 915                 else
 916                         send = false;
 917                 inet_putpeer(peer);
 918         }
 919         if (send)
 920                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 921
 922 out:    kfree_skb(skb);
 923         return 0;
 924 }
 925
 926 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 927 {
 928         struct dst_entry *dst = &rt->dst;
 929         struct fib_result res;
 930
 931         if (dst_metric_locked(dst, RTAX_MTU))
 932                 return;
 933
 934         if (dst->dev->mtu < mtu)
 935                 return;
 936
 937         if (mtu < ip_rt_min_pmtu)
 938                 mtu = ip_rt_min_pmtu;
 939
 940         if (!rt->rt_pmtu) {
 941                 dst->obsolete = DST_OBSOLETE_KILL;
 942         } else {
 943                 rt->rt_pmtu = mtu;
 944                 dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
 945         }
 946
 947         rcu_read_lock();
 948         if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
 949                 struct fib_nh *nh = &FIB_RES_NH(res);
 950
 951                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 952                                       jiffies + ip_rt_mtu_expires);
 953         }
 954         rcu_read_unlock();
 955 }
 956
 957 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 958                               struct sk_buff *skb, u32 mtu)
 959 {
 960         struct rtable *rt = (struct rtable *) dst;
 961         struct flowi4 fl4;
 962
 963         ip_rt_build_flow_key(&fl4, sk, skb);
 964         __ip_rt_update_pmtu(rt, &fl4, mtu);
 965 }
 966
 967 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 968                       int oif, u32 mark, u8 protocol, int flow_flags)
 969 {
 970         const struct iphdr *iph = (const struct iphdr *) skb->data;
 971         struct flowi4 fl4;
 972         struct rtable *rt;
 973
 974         __build_flow_key(&fl4, NULL, iph, oif,
 975                          RT_TOS(iph->tos), protocol, mark, flow_flags);
 976         rt = __ip_route_output_key(net, &fl4);
 977         if (!IS_ERR(rt)) {
 978                 __ip_rt_update_pmtu(rt, &fl4, mtu);
 979                 ip_rt_put(rt);
 980         }
 981 }
 982 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
 983
 984 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 985 {
 986         const struct iphdr *iph = (const struct iphdr *) skb->data;
 987         struct flowi4 fl4;
 988         struct rtable *rt;
 989
 990         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
 991         rt = __ip_route_output_key(sock_net(sk), &fl4);
 992         if (!IS_ERR(rt)) {
 993                 __ip_rt_update_pmtu(rt, &fl4, mtu);
 994                 ip_rt_put(rt);
 995         }
 996 }
 997
 998 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 999 {
1000         const struct iphdr *iph = (const struct iphdr *) skb->data;
1001         struct flowi4 fl4;
1002         struct rtable *rt;
1003         struct dst_entry *odst = NULL;
1004         bool new = false;
1005
1006         bh_lock_sock(sk);
1007         odst = sk_dst_get(sk);
1008
1009         if (sock_owned_by_user(sk) || !odst) {
1010                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1011                 goto out;
1012         }
1013
1014         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1015
1016         rt = (struct rtable *)odst;
1017         if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
1018                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1019                 if (IS_ERR(rt))
1020                         goto out;
1021
1022                 new = true;
1023         }
1024
1025         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1026
1027         if (!dst_check(&rt->dst, 0)) {
1028                 if (new)
1029                         dst_release(&rt->dst);
1030
1031                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1032                 if (IS_ERR(rt))
1033                         goto out;
1034
1035                 new = true;
1036         }
1037
1038         if (new)
1039                 sk_dst_set(sk, &rt->dst);
1040
1041 out:
1042         bh_unlock_sock(sk);
1043         dst_release(odst);
1044 }
1045 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1046
1047 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1048                    int oif, u32 mark, u8 protocol, int flow_flags)
1049 {
1050         const struct iphdr *iph = (const struct iphdr *) skb->data;
1051         struct flowi4 fl4;
1052         struct rtable *rt;
1053
1054         __build_flow_key(&fl4, NULL, iph, oif,
1055                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1056         rt = __ip_route_output_key(net, &fl4);
1057         if (!IS_ERR(rt)) {
1058                 __ip_do_redirect(rt, skb, &fl4, false);
1059                 ip_rt_put(rt);
1060         }
1061 }
1062 EXPORT_SYMBOL_GPL(ipv4_redirect);
1063
1064 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1065 {
1066         const struct iphdr *iph = (const struct iphdr *) skb->data;
1067         struct flowi4 fl4;
1068         struct rtable *rt;
1069
1070         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1071         rt = __ip_route_output_key(sock_net(sk), &fl4);
1072         if (!IS_ERR(rt)) {
1073                 __ip_do_redirect(rt, skb, &fl4, false);
1074                 ip_rt_put(rt);
1075         }
1076 }
1077 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1078
1079 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1080 {
1081         struct rtable *rt = (struct rtable *) dst;
1082
1083         /* All IPV4 dsts are created with ->obsolete set to the value
1084          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1085          * into this function always.
1086          *
1087          * When a PMTU/redirect information update invalidates a
1088          * route, this is indicated by setting obsolete to
1089          * DST_OBSOLETE_KILL.
1090          */
1091         if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1092                 return NULL;
1093         return dst;
1094 }
1095
1096 static void ipv4_link_failure(struct sk_buff *skb)
1097 {
1098         struct rtable *rt;
1099
1100         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1101
1102         rt = skb_rtable(skb);
1103         if (rt)
1104                 dst_set_expires(&rt->dst, 0);
1105 }
1106
1107 static int ip_rt_bug(struct sk_buff *skb)
1108 {
1109         pr_debug("%s: %pI4 -> %pI4, %s\n",
1110                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1111                  skb->dev ? skb->dev->name : "?");
1112         kfree_skb(skb);
1113         WARN_ON(1);
1114         return 0;
1115 }
1116
1117 /*
1118    We do not cache source address of outgoing interface,
1119    because it is used only by IP RR, TS and SRR options,
1120    so that it out of fast path.
1121
1122    BTW remember: "addr" is allowed to be not aligned
1123    in IP options!
1124  */
1125
1126 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1127 {
1128         __be32 src;
1129
1130         if (rt_is_output_route(rt))
1131                 src = ip_hdr(skb)->saddr;
1132         else {
1133                 struct fib_result res;
1134                 struct flowi4 fl4;
1135                 struct iphdr *iph;
1136
1137                 iph = ip_hdr(skb);
1138
1139                 memset(&fl4, 0, sizeof(fl4));
1140                 fl4.daddr = iph->daddr;
1141                 fl4.saddr = iph->saddr;
1142                 fl4.flowi4_tos = RT_TOS(iph->tos);
1143                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1144                 fl4.flowi4_iif = skb->dev->ifindex;
1145                 fl4.flowi4_mark = skb->mark;
1146
1147                 rcu_read_lock();
1148                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1149                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1150                 else
1151                         src = inet_select_addr(rt->dst.dev,
1152                                                rt_nexthop(rt, iph->daddr),
1153                                                RT_SCOPE_UNIVERSE);
1154                 rcu_read_unlock();
1155         }
1156         memcpy(addr, &src, 4);
1157 }
1158
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and the high 16 bits
 * are handled separately, and each half is filled in from @tag only if
 * that half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low = tag & 0xFFFF;
        const u32 high = tag & 0xFFFF0000;

        if ((rt->dst.tclassid & 0xFFFF) == 0)
                rt->dst.tclassid |= low;

        if ((rt->dst.tclassid & 0xFFFF0000) == 0)
                rt->dst.tclassid |= high;
}
#endif
1168
1169 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1170 {
1171         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1172
1173         if (advmss == 0) {
1174                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1175                                ip_rt_min_advmss);
1176                 if (advmss > 65535 - 40)
1177                         advmss = 65535 - 40;
1178         }
1179         return advmss;
1180 }
1181
1182 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1183 {
1184         const struct rtable *rt = (const struct rtable *) dst;
1185         unsigned int mtu = rt->rt_pmtu;
1186
1187         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1188                 mtu = dst_metric_raw(dst, RTAX_MTU);
1189
1190         if (mtu)
1191                 return mtu;
1192
1193         mtu = dst->dev->mtu;
1194
1195         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1196                 if (rt->rt_uses_gateway && mtu > 576)
1197                         mtu = 576;
1198         }
1199
1200         if (mtu > IP_MAX_MTU)
1201                 mtu = IP_MAX_MTU;
1202
1203         return mtu;
1204 }
1205
1206 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1207 {
1208         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1209         struct fib_nh_exception *fnhe;
1210         u32 hval;
1211
1212         if (!hash)
1213                 return NULL;
1214
1215         hval = fnhe_hashfun(daddr);
1216
1217         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1218              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1219                 if (fnhe->fnhe_daddr == daddr)
1220                         return fnhe;
1221         }
1222         return NULL;
1223 }
1224
1225 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1226                               __be32 daddr)
1227 {
1228         bool ret = false;
1229
1230         spin_lock_bh(&fnhe_lock);
1231
1232         if (daddr == fnhe->fnhe_daddr) {
1233                 struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
1234                 if (orig && rt_is_expired(orig)) {
1235                         fnhe->fnhe_gw = 0;
1236                         fnhe->fnhe_pmtu = 0;
1237                         fnhe->fnhe_expires = 0;
1238                 }
1239                 if (fnhe->fnhe_pmtu) {
1240                         unsigned long expires = fnhe->fnhe_expires;
1241                         unsigned long diff = expires - jiffies;
1242
1243                         if (time_before(jiffies, expires)) {
1244                                 rt->rt_pmtu = fnhe->fnhe_pmtu;
1245                                 dst_set_expires(&rt->dst, diff);
1246                         }
1247                 }
1248                 if (fnhe->fnhe_gw) {
1249                         rt->rt_flags |= RTCF_REDIRECTED;
1250                         rt->rt_gateway = fnhe->fnhe_gw;
1251                         rt->rt_uses_gateway = 1;
1252                 } else if (!rt->rt_gateway)
1253                         rt->rt_gateway = daddr;
1254
1255                 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1256                 if (orig)
1257                         rt_free(orig);
1258
1259                 fnhe->fnhe_stamp = jiffies;
1260                 ret = true;
1261         }
1262         spin_unlock_bh(&fnhe_lock);
1263
1264         return ret;
1265 }
1266
1267 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1268 {
1269         struct rtable *orig, *prev, **p;
1270         bool ret = true;
1271
1272         if (rt_is_input_route(rt)) {
1273                 p = (struct rtable **)&nh->nh_rth_input;
1274         } else {
1275                 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1276         }
1277         orig = *p;
1278
1279         prev = cmpxchg(p, orig, rt);
1280         if (prev == orig) {
1281                 if (orig)
1282                         rt_free(orig);
1283         } else
1284                 ret = false;
1285
1286         return ret;
1287 }
1288
1289 static DEFINE_SPINLOCK(rt_uncached_lock);
1290 static LIST_HEAD(rt_uncached_list);
1291
1292 static void rt_add_uncached_list(struct rtable *rt)
1293 {
1294         spin_lock_bh(&rt_uncached_lock);
1295         list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1296         spin_unlock_bh(&rt_uncached_lock);
1297 }
1298
1299 static void ipv4_dst_destroy(struct dst_entry *dst)
1300 {
1301         struct rtable *rt = (struct rtable *) dst;
1302
1303         if (!list_empty(&rt->rt_uncached)) {
1304                 spin_lock_bh(&rt_uncached_lock);
1305                 list_del(&rt->rt_uncached);
1306                 spin_unlock_bh(&rt_uncached_lock);
1307         }
1308 }
1309
1310 void rt_flush_dev(struct net_device *dev)
1311 {
1312         if (!list_empty(&rt_uncached_list)) {
1313                 struct net *net = dev_net(dev);
1314                 struct rtable *rt;
1315
1316                 spin_lock_bh(&rt_uncached_lock);
1317                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1318                         if (rt->dst.dev != dev)
1319                                 continue;
1320                         rt->dst.dev = net->loopback_dev;
1321                         dev_hold(rt->dst.dev);
1322                         dev_put(dev);
1323                 }
1324                 spin_unlock_bh(&rt_uncached_lock);
1325         }
1326 }
1327
1328 static bool rt_cache_valid(const struct rtable *rt)
1329 {
1330         return  rt &&
1331                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1332                 !rt_is_expired(rt);
1333 }
1334
1335 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1336                            const struct fib_result *res,
1337                            struct fib_nh_exception *fnhe,
1338                            struct fib_info *fi, u16 type, u32 itag)
1339 {
1340         bool cached = false;
1341
1342         if (fi) {
1343                 struct fib_nh *nh = &FIB_RES_NH(*res);
1344
1345                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1346                         rt->rt_gateway = nh->nh_gw;
1347                         rt->rt_uses_gateway = 1;
1348                 }
1349                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1350 #ifdef CONFIG_IP_ROUTE_CLASSID
1351                 rt->dst.tclassid = nh->nh_tclassid;
1352 #endif
1353                 if (unlikely(fnhe))
1354                         cached = rt_bind_exception(rt, fnhe, daddr);
1355                 else if (!(rt->dst.flags & DST_NOCACHE))
1356                         cached = rt_cache_route(nh, rt);
1357                 if (unlikely(!cached)) {
1358                         /* Routes we intend to cache in nexthop exception or
1359                          * FIB nexthop have the DST_NOCACHE bit clear.
1360                          * However, if we are unsuccessful at storing this
1361                          * route into the cache we really need to set it.
1362                          */
1363                         rt->dst.flags |= DST_NOCACHE;
1364                         if (!rt->rt_gateway)
1365                                 rt->rt_gateway = daddr;
1366                         rt_add_uncached_list(rt);
1367                 }
1368         } else
1369                 rt_add_uncached_list(rt);
1370
1371 #ifdef CONFIG_IP_ROUTE_CLASSID
1372 #ifdef CONFIG_IP_MULTIPLE_TABLES
1373         set_class_tag(rt, res->tclassid);
1374 #endif
1375         set_class_tag(rt, itag);
1376 #endif
1377 }
1378
1379 static struct rtable *rt_dst_alloc(struct net_device *dev,
1380                                    bool nopolicy, bool noxfrm, bool will_cache)
1381 {
1382         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1383                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1384                          (nopolicy ? DST_NOPOLICY : 0) |
1385                          (noxfrm ? DST_NOXFRM : 0));
1386 }
1387
1388 /* called in rcu_read_lock() section */
1389 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1390                                 u8 tos, struct net_device *dev, int our)
1391 {
1392         struct rtable *rth;
1393         struct in_device *in_dev = __in_dev_get_rcu(dev);
1394         u32 itag = 0;
1395         int err;
1396
1397         /* Primary sanity checks. */
1398
1399         if (in_dev == NULL)
1400                 return -EINVAL;
1401
1402         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1403             skb->protocol != htons(ETH_P_IP))
1404                 goto e_inval;
1405
1406         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1407                 if (ipv4_is_loopback(saddr))
1408                         goto e_inval;
1409
1410         if (ipv4_is_zeronet(saddr)) {
1411                 if (!ipv4_is_local_multicast(daddr))
1412                         goto e_inval;
1413         } else {
1414                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1415                                           in_dev, &itag);
1416                 if (err < 0)
1417                         goto e_err;
1418         }
1419         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1420                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1421         if (!rth)
1422                 goto e_nobufs;
1423
1424 #ifdef CONFIG_IP_ROUTE_CLASSID
1425         rth->dst.tclassid = itag;
1426 #endif
1427         rth->dst.output = ip_rt_bug;
1428
1429         rth->rt_genid   = rt_genid(dev_net(dev));
1430         rth->rt_flags   = RTCF_MULTICAST;
1431         rth->rt_type    = RTN_MULTICAST;
1432         rth->rt_is_input= 1;
1433         rth->rt_iif     = 0;
1434         rth->rt_pmtu    = 0;
1435         rth->rt_gateway = 0;
1436         rth->rt_uses_gateway = 0;
1437         INIT_LIST_HEAD(&rth->rt_uncached);
1438         if (our) {
1439                 rth->dst.input= ip_local_deliver;
1440                 rth->rt_flags |= RTCF_LOCAL;
1441         }
1442
1443 #ifdef CONFIG_IP_MROUTE
1444         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1445                 rth->dst.input = ip_mr_input;
1446 #endif
1447         RT_CACHE_STAT_INC(in_slow_mc);
1448
1449         skb_dst_set(skb, &rth->dst);
1450         return 0;
1451
1452 e_nobufs:
1453         return -ENOBUFS;
1454 e_inval:
1455         return -EINVAL;
1456 e_err:
1457         return err;
1458 }
1459
1460
1461 static void ip_handle_martian_source(struct net_device *dev,
1462                                      struct in_device *in_dev,
1463                                      struct sk_buff *skb,
1464                                      __be32 daddr,
1465                                      __be32 saddr)
1466 {
1467         RT_CACHE_STAT_INC(in_martian_src);
1468 #ifdef CONFIG_IP_ROUTE_VERBOSE
1469         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1470                 /*
1471                  *      RFC1812 recommendation, if source is martian,
1472                  *      the only hint is MAC header.
1473                  */
1474                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1475                         &daddr, &saddr, dev->name);
1476                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1477                         print_hex_dump(KERN_WARNING, "ll header: ",
1478                                        DUMP_PREFIX_OFFSET, 16, 1,
1479                                        skb_mac_header(skb),
1480                                        dev->hard_header_len, true);
1481                 }
1482         }
1483 #endif
1484 }
1485
1486 /* called in rcu_read_lock() section */
1487 static int __mkroute_input(struct sk_buff *skb,
1488                            const struct fib_result *res,
1489                            struct in_device *in_dev,
1490                            __be32 daddr, __be32 saddr, u32 tos)
1491 {
1492         struct rtable *rth;
1493         int err;
1494         struct in_device *out_dev;
1495         unsigned int flags = 0;
1496         bool do_cache;
1497         u32 itag = 0;
1498
1499         /* get a working reference to the output device */
1500         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1501         if (out_dev == NULL) {
1502                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1503                 return -EINVAL;
1504         }
1505
1506         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1507                                   in_dev->dev, in_dev, &itag);
1508         if (err < 0) {
1509                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1510                                          saddr);
1511
1512                 goto cleanup;
1513         }
1514
1515         do_cache = res->fi && !itag;
1516         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1517             skb->protocol == htons(ETH_P_IP) &&
1518             (IN_DEV_SHARED_MEDIA(out_dev) ||
1519              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1520                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1521
1522         if (skb->protocol != htons(ETH_P_IP)) {
1523                 /* Not IP (i.e. ARP). Do not create route, if it is
1524                  * invalid for proxy arp. DNAT routes are always valid.
1525                  *
1526                  * Proxy arp feature have been extended to allow, ARP
1527                  * replies back to the same interface, to support
1528                  * Private VLAN switch technologies. See arp.c.
1529                  */
1530                 if (out_dev == in_dev &&
1531                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1532                         err = -EINVAL;
1533                         goto cleanup;
1534                 }
1535         }
1536
1537         if (do_cache) {
1538                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1539                 if (rt_cache_valid(rth)) {
1540                         skb_dst_set_noref(skb, &rth->dst);
1541                         goto out;
1542                 }
1543         }
1544
1545         rth = rt_dst_alloc(out_dev->dev,
1546                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1547                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1548         if (!rth) {
1549                 err = -ENOBUFS;
1550                 goto cleanup;
1551         }
1552
1553         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1554         rth->rt_flags = flags;
1555         rth->rt_type = res->type;
1556         rth->rt_is_input = 1;
1557         rth->rt_iif     = 0;
1558         rth->rt_pmtu    = 0;
1559         rth->rt_gateway = 0;
1560         rth->rt_uses_gateway = 0;
1561         INIT_LIST_HEAD(&rth->rt_uncached);
1562         RT_CACHE_STAT_INC(in_slow_tot);
1563
1564         rth->dst.input = ip_forward;
1565         rth->dst.output = ip_output;
1566
1567         rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1568         skb_dst_set(skb, &rth->dst);
1569 out:
1570         err = 0;
1571  cleanup:
1572         return err;
1573 }
1574
1575 static int ip_mkroute_input(struct sk_buff *skb,
1576                             struct fib_result *res,
1577                             const struct flowi4 *fl4,
1578                             struct in_device *in_dev,
1579                             __be32 daddr, __be32 saddr, u32 tos)
1580 {
1581 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1582         if (res->fi && res->fi->fib_nhs > 1)
1583                 fib_select_multipath(res);
1584 #endif
1585
1586         /* create a routing cache entry */
1587         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1588 }
1589
1590 /*
1591  *      NOTE. We drop all the packets that has local source
1592  *      addresses, because every properly looped back packet
1593  *      must have correct destination already attached by output routine.
1594  *
1595  *      Such approach solves two big problems:
1596  *      1. Not simplex devices are handled properly.
1597  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1598  *      called with rcu_read_lock()
1599  */
1600
1601 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1602                                u8 tos, struct net_device *dev)
1603 {
1604         struct fib_result res;
1605         struct in_device *in_dev = __in_dev_get_rcu(dev);
1606         struct flowi4   fl4;
1607         unsigned int    flags = 0;
1608         u32             itag = 0;
1609         struct rtable   *rth;
1610         int             err = -EINVAL;
1611         struct net    *net = dev_net(dev);
1612         bool do_cache;
1613
1614         /* IP on this device is disabled. */
1615
1616         if (!in_dev)
1617                 goto out;
1618
1619         /* Check for the most weird martians, which can be not detected
1620            by fib_lookup.
1621          */
1622
1623         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1624                 goto martian_source;
1625
1626         res.fi = NULL;
1627         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1628                 goto brd_input;
1629
1630         /* Accept zero addresses only to limited broadcast;
1631          * I even do not know to fix it or not. Waiting for complains :-)
1632          */
1633         if (ipv4_is_zeronet(saddr))
1634                 goto martian_source;
1635
1636         if (ipv4_is_zeronet(daddr))
1637                 goto martian_destination;
1638
1639         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1640          * and call it once if daddr or/and saddr are loopback addresses
1641          */
1642         if (ipv4_is_loopback(daddr)) {
1643                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1644                         goto martian_destination;
1645         } else if (ipv4_is_loopback(saddr)) {
1646                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1647                         goto martian_source;
1648         }
1649
1650         /*
1651          *      Now we are ready to route packet.
1652          */
1653         fl4.flowi4_oif = 0;
1654         fl4.flowi4_iif = dev->ifindex;
1655         fl4.flowi4_mark = skb->mark;
1656         fl4.flowi4_tos = tos;
1657         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1658         fl4.daddr = daddr;
1659         fl4.saddr = saddr;
1660         err = fib_lookup(net, &fl4, &res);
1661         if (err != 0)
1662                 goto no_route;
1663
1664         if (res.type == RTN_BROADCAST)
1665                 goto brd_input;
1666
1667         if (res.type == RTN_LOCAL) {
1668                 err = fib_validate_source(skb, saddr, daddr, tos,
1669                                           LOOPBACK_IFINDEX,
1670                                           dev, in_dev, &itag);
1671                 if (err < 0)
1672                         goto martian_source_keep_err;
1673                 goto local_input;
1674         }
1675
1676         if (!IN_DEV_FORWARD(in_dev))
1677                 goto no_route;
1678         if (res.type != RTN_UNICAST)
1679                 goto martian_destination;
1680
1681         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1682 out:    return err;
1683
1684 brd_input:
1685         if (skb->protocol != htons(ETH_P_IP))
1686                 goto e_inval;
1687
1688         if (!ipv4_is_zeronet(saddr)) {
1689                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1690                                           in_dev, &itag);
1691                 if (err < 0)
1692                         goto martian_source_keep_err;
1693         }
1694         flags |= RTCF_BROADCAST;
1695         res.type = RTN_BROADCAST;
1696         RT_CACHE_STAT_INC(in_brd);
1697
1698 local_input:
1699         do_cache = false;
1700         if (res.fi) {
1701                 if (!itag) {
1702                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1703                         if (rt_cache_valid(rth)) {
1704                                 skb_dst_set_noref(skb, &rth->dst);
1705                                 err = 0;
1706                                 goto out;
1707                         }
1708                         do_cache = true;
1709                 }
1710         }
1711
1712         rth = rt_dst_alloc(net->loopback_dev,
1713                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1714         if (!rth)
1715                 goto e_nobufs;
1716
1717         rth->dst.input= ip_local_deliver;
1718         rth->dst.output= ip_rt_bug;
1719 #ifdef CONFIG_IP_ROUTE_CLASSID
1720         rth->dst.tclassid = itag;
1721 #endif
1722
1723         rth->rt_genid = rt_genid(net);
1724         rth->rt_flags   = flags|RTCF_LOCAL;
1725         rth->rt_type    = res.type;
1726         rth->rt_is_input = 1;
1727         rth->rt_iif     = 0;
1728         rth->rt_pmtu    = 0;
1729         rth->rt_gateway = 0;
1730         rth->rt_uses_gateway = 0;
1731         INIT_LIST_HEAD(&rth->rt_uncached);
1732         RT_CACHE_STAT_INC(in_slow_tot);
1733         if (res.type == RTN_UNREACHABLE) {
1734                 rth->dst.input= ip_error;
1735                 rth->dst.error= -err;
1736                 rth->rt_flags   &= ~RTCF_LOCAL;
1737         }
1738         if (do_cache) {
1739                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1740                         rth->dst.flags |= DST_NOCACHE;
1741                         rt_add_uncached_list(rth);
1742                 }
1743         }
1744         skb_dst_set(skb, &rth->dst);
1745         err = 0;
1746         goto out;
1747
1748 no_route:
1749         RT_CACHE_STAT_INC(in_no_route);
1750         res.type = RTN_UNREACHABLE;
1751         if (err == -ESRCH)
1752                 err = -ENETUNREACH;
1753         goto local_input;
1754
1755         /*
1756          *      Do not cache martian addresses: they should be logged (RFC1812)
1757          */
1758 martian_destination:
1759         RT_CACHE_STAT_INC(in_martian_dst);
1760 #ifdef CONFIG_IP_ROUTE_VERBOSE
1761         if (IN_DEV_LOG_MARTIANS(in_dev))
1762                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1763                                      &daddr, &saddr, dev->name);
1764 #endif
1765
1766 e_inval:
1767         err = -EINVAL;
1768         goto out;
1769
1770 e_nobufs:
1771         err = -ENOBUFS;
1772         goto out;
1773
1774 martian_source:
1775         err = -EINVAL;
1776 martian_source_keep_err:
1777         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1778         goto out;
1779 }
1780
1781 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1782                          u8 tos, struct net_device *dev)
1783 {
1784         int res;
1785
1786         rcu_read_lock();
1787
1788         /* Multicast recognition logic is moved from route cache to here.
1789            The problem was that too many Ethernet cards have broken/missing
1790            hardware multicast filters :-( As result the host on multicasting
1791            network acquires a lot of useless route cache entries, sort of
1792            SDR messages from all the world. Now we try to get rid of them.
1793            Really, provided software IP multicast filter is organized
1794            reasonably (at least, hashed), it does not result in a slowdown
1795            comparing with route cache reject entries.
1796            Note, that multicast routers are not affected, because
1797            route cache entry is created eventually.
1798          */
1799         if (ipv4_is_multicast(daddr)) {
1800                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1801
1802                 if (in_dev) {
1803                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1804                                                   ip_hdr(skb)->protocol);
1805                         if (our
1806 #ifdef CONFIG_IP_MROUTE
1807                                 ||
1808                             (!ipv4_is_local_multicast(daddr) &&
1809                              IN_DEV_MFORWARD(in_dev))
1810 #endif
1811                            ) {
1812                                 int res = ip_route_input_mc(skb, daddr, saddr,
1813                                                             tos, dev, our);
1814                                 rcu_read_unlock();
1815                                 return res;
1816                         }
1817                 }
1818                 rcu_read_unlock();
1819                 return -EINVAL;
1820         }
1821         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1822         rcu_read_unlock();
1823         return res;
1824 }
1825 EXPORT_SYMBOL(ip_route_input_noref);
1826
1827 /*
      * Build the output route for an already resolved flow, or return a
      * still-valid cached one.  Called with rcu_read_lock() held.
      *
      * @res:      FIB lookup result; res->fi may be NULL.
      * @fl4:      flow key (daddr, saddr, flowi4_proto and flowi4_flags are read).
      * @orig_oif: interface index recorded in the new route's rt_iif.
      * @dev_out:  device the route will point at.
      * @flags:    initial RTCF_* flags for the route.
      *
      * Returns a cached rtable after taking dst_hold() on it, a freshly
      * allocated rtable, ERR_PTR(-EINVAL) for a rejected flow, or
      * ERR_PTR(-ENOBUFS) when rt_dst_alloc() fails.
      */
1828 static struct rtable *__mkroute_output(const struct fib_result *res,
1829                                        const struct flowi4 *fl4, int orig_oif,
1830                                        struct net_device *dev_out,
1831                                        unsigned int flags)
1832 {
1833         struct fib_info *fi = res->fi;
1834         struct fib_nh_exception *fnhe;
1835         struct in_device *in_dev;
1836         u16 type = res->type;
1837         struct rtable *rth;
1838         bool do_cache;
1839
1840         in_dev = __in_dev_get_rcu(dev_out);
1841         if (!in_dev)
1842                 return ERR_PTR(-EINVAL);
1843
             /* Unless route_localnet is set on the device, a loopback source
              * address is only accepted when dev_out is a loopback device.
              */
1844         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1845                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1846                         return ERR_PTR(-EINVAL);
1847
             /* The destination address class overrides the type taken from
              * the FIB result; a zeronet destination is rejected.
              */
1848         if (ipv4_is_lbcast(fl4->daddr))
1849                 type = RTN_BROADCAST;
1850         else if (ipv4_is_multicast(fl4->daddr))
1851                 type = RTN_MULTICAST;
1852         else if (ipv4_is_zeronet(fl4->daddr))
1853                 return ERR_PTR(-EINVAL);
1854
1855         if (dev_out->flags & IFF_LOOPBACK)
1856                 flags |= RTCF_LOCAL;
1857
             /* Broadcast routes drop the fib_info, which also disables caching
              * below.  Multicast routes keep RTCF_LOCAL only when
              * ip_check_mc_rcu() reports a match, and in that case the route
              * is not cached.
              */
1858         do_cache = true;
1859         if (type == RTN_BROADCAST) {
1860                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1861                 fi = NULL;
1862         } else if (type == RTN_MULTICAST) {
1863                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1864                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1865                                      fl4->flowi4_proto))
1866                         flags &= ~RTCF_LOCAL;
1867                 else
1868                         do_cache = false;
1869                 /* If multicast route do not exist use
1870                  * default one, but do not gateway in this case.
1871                  * Yes, it is hack.
1872                  */
1873                 if (fi && res->prefixlen < 4)
1874                         fi = NULL;
1875         }
1876
             /* Caching needs a fib_info.  The cache slot is either the
              * exception entry found for this destination or this CPU's
              * slot in the nexthop's nh_pcpu_rth_output.
              */
1877         fnhe = NULL;
1878         do_cache &= fi != NULL;
1879         if (do_cache) {
1880                 struct rtable __rcu **prth;
1881                 struct fib_nh *nh = &FIB_RES_NH(*res);
1882
1883                 fnhe = find_exception(nh, fl4->daddr);
1884                 if (fnhe)
1885                         prth = &fnhe->fnhe_rth;
1886                 else {
                             /* With FLOWI_FLAG_KNOWN_NH the per-cpu slot is
                              * used only when the nexthop has a gateway with
                              * link scope; otherwise build an uncached route.
                              */
1887                         if (unlikely(fl4->flowi4_flags &
1888                                      FLOWI_FLAG_KNOWN_NH &&
1889                                      !(nh->nh_gw &&
1890                                        nh->nh_scope == RT_SCOPE_LINK))) {
1891                                 do_cache = false;
1892                                 goto add;
1893                         }
1894                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1895                 }
                     /* Cache hit: take a reference and return the entry. */
1896                 rth = rcu_dereference(*prth);
1897                 if (rt_cache_valid(rth)) {
1898                         dst_hold(&rth->dst);
1899                         return rth;
1900                 }
1901         }
1902
1903 add:
1904         rth = rt_dst_alloc(dev_out,
1905                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1906                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1907                            do_cache);
1908         if (!rth)
1909                 return ERR_PTR(-ENOBUFS);
1910
1911         rth->dst.output = ip_output;
1912
1913         rth->rt_genid = rt_genid(dev_net(dev_out));
1914         rth->rt_flags   = flags;
1915         rth->rt_type    = type;
1916         rth->rt_is_input = 0;
1917         rth->rt_iif     = orig_oif ? : 0;
1918         rth->rt_pmtu    = 0;
1919         rth->rt_gateway = 0;
1920         rth->rt_uses_gateway = 0;
1921         INIT_LIST_HEAD(&rth->rt_uncached);
1922
1923         RT_CACHE_STAT_INC(out_slow_tot);
1924
             /* Select the input/output handlers: local routes are delivered
              * locally; local broadcast/multicast routes on a non-loopback
              * device are sent through ip_mc_output.
              */
1925         if (flags & RTCF_LOCAL)
1926                 rth->dst.input = ip_local_deliver;
1927         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1928                 if (flags & RTCF_LOCAL &&
1929                     !(dev_out->flags & IFF_LOOPBACK)) {
1930                         rth->dst.output = ip_mc_output;
1931                         RT_CACHE_STAT_INC(out_slow_mc);
1932                 }
1933 #ifdef CONFIG_IP_MROUTE
                     /* With multicast forwarding enabled on the device,
                      * non-link-local groups use ip_mr_input/ip_mc_output.
                      */
1934                 if (type == RTN_MULTICAST) {
1935                         if (IN_DEV_MFORWARD(in_dev) &&
1936                             !ipv4_is_local_multicast(fl4->daddr)) {
1937                                 rth->dst.input = ip_mr_input;
1938                                 rth->dst.output = ip_mc_output;
1939                         }
1940                 }
1941 #endif
1942         }
1943
1944         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1945
1946         return rth;
1947 }
1948
1949 /*
1950  * Major route resolver routine.
      *
      * Resolves an output route for @fl4 in namespace @net.  The lookup runs
      * under rcu_read_lock(), taken and released here.
      *
      * @fl4 is modified: flowi4_iif is set to LOOPBACK_IFINDEX, flowi4_tos is
      * masked with IPTOS_RT_MASK, flowi4_scope is derived from RTO_ONLINK, and
      * saddr, daddr and flowi4_oif may be filled in along the way.
      *
      * Returns whatever __mkroute_output() returns, or one of
      * ERR_PTR(-EINVAL), ERR_PTR(-ENODEV), ERR_PTR(-ENETUNREACH) when the
      * flow is rejected before a route is built.
      *
      * NOTE(review): res.type is not assigned before the "goto make_route"
      * shortcuts taken for multicast / limited-broadcast destinations;
      * confirm that __mkroute_output() always overrides the type there.
1951  */
1952
1953 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1954 {
1955         struct net_device *dev_out = NULL;
1956         __u8 tos = RT_FL_TOS(fl4);
1957         unsigned int flags = 0;
1958         struct fib_result res;
1959         struct rtable *rth;
1960         int orig_oif;
1961
1962         res.tclassid    = 0;
1963         res.fi          = NULL;
1964         res.table       = NULL;
1965
             /* Remember the caller's oif; fl4->flowi4_oif may be rewritten below. */
1966         orig_oif = fl4->flowi4_oif;
1967
1968         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1969         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1970         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1971                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1972
1973         rcu_read_lock();
             /* A caller-supplied source address must not be multicast,
              * limited broadcast or zeronet.
              */
1974         if (fl4->saddr) {
1975                 rth = ERR_PTR(-EINVAL);
1976                 if (ipv4_is_multicast(fl4->saddr) ||
1977                     ipv4_is_lbcast(fl4->saddr) ||
1978                     ipv4_is_zeronet(fl4->saddr))
1979                         goto out;
1980
1981                 /* I removed check for oif == dev_out->oif here.
1982                    It was wrong for two reasons:
1983                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1984                       is assigned to multiple interfaces.
1985                    2. Moreover, we are allowed to send packets with saddr
1986                       of another iface. --ANK
1987                  */
1988
1989                 if (fl4->flowi4_oif == 0 &&
1990                     (ipv4_is_multicast(fl4->daddr) ||
1991                      ipv4_is_lbcast(fl4->daddr))) {
1992                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1993                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1994                         if (dev_out == NULL)
1995                                 goto out;
1996
1997                         /* Special hack: user can direct multicasts
1998                            and limited broadcast via necessary interface
1999                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2000                            This hack is not just for fun, it allows
2001                            vic,vat and friends to work.
2002                            They bind socket to loopback, set ttl to zero
2003                            and expect that it will work.
2004                            From the viewpoint of routing cache they are broken,
2005                            because we are not allowed to build multicast path
2006                            with loopback source addr (look, routing cache
2007                            cannot know, that ttl is zero, so that packet
2008                            will not leave this host and route is valid).
2009                            Luckily, this hack is good workaround.
2010                          */
2011
2012                         fl4->flowi4_oif = dev_out->ifindex;
2013                         goto make_route;
2014                 }
2015
                     /* Without FLOWI_FLAG_ANYSRC the source address must
                      * be found by __ip_dev_find().
                      */
2016                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2017                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2018                         if (!__ip_dev_find(net, fl4->saddr, false))
2019                                 goto out;
2020                 }
2021         }
2022
2023
             /* An output interface was requested: it must exist, be up and
              * have an in_device.  Link-local multicast and limited broadcast
              * skip the FIB lookup entirely.
              */
2024         if (fl4->flowi4_oif) {
2025                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2026                 rth = ERR_PTR(-ENODEV);
2027                 if (dev_out == NULL)
2028                         goto out;
2029
2030                 /* RACE: Check return value of inet_select_addr instead. */
2031                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2032                         rth = ERR_PTR(-ENETUNREACH);
2033                         goto out;
2034                 }
2035                 if (ipv4_is_local_multicast(fl4->daddr) ||
2036                     ipv4_is_lbcast(fl4->daddr)) {
2037                         if (!fl4->saddr)
2038                                 fl4->saddr = inet_select_addr(dev_out, 0,
2039                                                               RT_SCOPE_LINK);
2040                         goto make_route;
2041                 }
2042                 if (!fl4->saddr) {
2043                         if (ipv4_is_multicast(fl4->daddr))
2044                                 fl4->saddr = inet_select_addr(dev_out, 0,
2045                                                               fl4->flowi4_scope);
2046                         else if (!fl4->daddr)
2047                                 fl4->saddr = inet_select_addr(dev_out, 0,
2048                                                               RT_SCOPE_HOST);
2049                 }
2050         }
2051
             /* No destination given: use the source address as destination,
              * falling back to INADDR_LOOPBACK for both, and route through
              * the namespace's loopback device as a local route.
              */
2052         if (!fl4->daddr) {
2053                 fl4->daddr = fl4->saddr;
2054                 if (!fl4->daddr)
2055                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2056                 dev_out = net->loopback_dev;
2057                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2058                 res.type = RTN_LOCAL;
2059                 flags |= RTCF_LOCAL;
2060                 goto make_route;
2061         }
2062
             /* FIB lookup; a non-zero return means no route was found. */
2063         if (fib_lookup(net, fl4, &res)) {
2064                 res.fi = NULL;
2065                 res.table = NULL;
2066                 if (fl4->flowi4_oif) {
2067                         /* Apparently, routing tables are wrong. Assume,
2068                            that the destination is on link.
2069
2070                            WHY? DW.
2071                            Because we are allowed to send to iface
2072                            even if it has NO routes and NO assigned
2073                            addresses. When oif is specified, routing
2074                            tables are looked up with only one purpose:
2075                            to catch if destination is gatewayed, rather than
2076                            direct. Moreover, if MSG_DONTROUTE is set,
2077                            we send packet, ignoring both routing tables
2078                            and ifaddr state. --ANK
2079
2080
2081                            We could make it even if oif is unknown,
2082                            likely IPv6, but we do not.
2083                          */
2084
2085                         if (fl4->saddr == 0)
2086                                 fl4->saddr = inet_select_addr(dev_out, 0,
2087                                                               RT_SCOPE_LINK);
2088                         res.type = RTN_UNICAST;
2089                         goto make_route;
2090                 }
2091                 rth = ERR_PTR(-ENETUNREACH);
2092                 goto out;
2093         }
2094
             /* Local destination: pick the source from the route's preferred
              * source (or the destination itself) and send via loopback.
              */
2095         if (res.type == RTN_LOCAL) {
2096                 if (!fl4->saddr) {
2097                         if (res.fi->fib_prefsrc)
2098                                 fl4->saddr = res.fi->fib_prefsrc;
2099                         else
2100                                 fl4->saddr = fl4->daddr;
2101                 }
2102                 dev_out = net->loopback_dev;
2103                 fl4->flowi4_oif = dev_out->ifindex;
2104                 flags |= RTCF_LOCAL;
2105                 goto make_route;
2106         }
2107
             /* With no oif requested, choose among multiple nexthops
              * (multipath builds) or among several default routes.
              */
2108 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2109         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2110                 fib_select_multipath(&res);
2111         else
2112 #endif
2113         if (!res.prefixlen &&
2114             res.table->tb_num_default > 1 &&
2115             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2116                 fib_select_default(&res);
2117
2118         if (!fl4->saddr)
2119                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2120
2121         dev_out = FIB_RES_DEV(res);
2122         fl4->flowi4_oif = dev_out->ifindex;
2123
2124
2125 make_route:
2126         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2127
2128 out:
2129         rcu_read_unlock();
2130         return rth;
2131 }
2132 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2133
2134 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2135 {
2136         return NULL;
2137 }
2138
2139 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2140 {
2141         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2142
2143         return mtu ? : dst->dev->mtu;
2144 }
2145
2146 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2147                                           struct sk_buff *skb, u32 mtu)
2148 {
2149 }
2150
/* .redirect hook of the blackhole dst_ops: intentionally does nothing. */
static void
ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
			   struct sk_buff *skb)
{
	/* redirects are ignored */
}
2155
2156 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2157                                           unsigned long old)
2158 {
2159         return NULL;
2160 }
2161
2162 static struct dst_ops ipv4_dst_blackhole_ops = {
2163         .family                 =       AF_INET,
2164         .protocol               =       cpu_to_be16(ETH_P_IP),
2165         .check                  =       ipv4_blackhole_dst_check,
2166         .mtu                    =       ipv4_blackhole_mtu,
2167         .default_advmss         =       ipv4_default_advmss,
2168         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2169         .redirect               =       ipv4_rt_blackhole_redirect,
2170         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2171         .neigh_lookup           =       ipv4_neigh_lookup,
2172 };
2173
2174 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2175 {
2176         struct rtable *ort = (struct rtable *) dst_orig;
2177         struct rtable *rt;
2178
2179         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2180         if (rt) {
2181                 struct dst_entry *new = &rt->dst;
2182
2183                 new->__use = 1;
2184                 new->input = dst_discard;
2185                 new->output = dst_discard;
2186
2187                 new->dev = ort->dst.dev;
2188                 if (new->dev)
2189                         dev_hold(new->dev);
2190
2191                 rt->rt_is_input = ort->rt_is_input;
2192                 rt->rt_iif = ort->rt_iif;
2193                 rt->rt_pmtu = ort->rt_pmtu;
2194
2195                 rt->rt_genid = rt_genid(net);
2196                 rt->rt_flags = ort->rt_flags;
2197                 rt->rt_type = ort->rt_type;
2198                 rt->rt_gateway = ort->rt_gateway;
2199                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2200
2201                 INIT_LIST_HEAD(&rt->rt_uncached);
2202
2203                 dst_free(new);
2204         }
2205
2206         dst_release(dst_orig);
2207
2208         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2209 }
2210
2211 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2212                                     struct sock *sk)
2213 {
2214         struct rtable *rt = __ip_route_output_key(net, flp4);
2215
2216         if (IS_ERR(rt))
2217                 return rt;
2218
2219         if (flp4->flowi4_proto)
2220                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2221                                                    flowi4_to_flowi(flp4),
2222                                                    sk, 0);
2223
2224         return rt;
2225 }
2226 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2227
     /*
      * Append a netlink route message describing the route attached to @skb
      * (skb_rtable(skb)) to @skb itself.
      *
      * @net:    network namespace (used for the multicast-forwarding check).
      * @dst:    destination address reported as RTA_DST.
      * @src:    source address; reported as RTA_SRC when non-zero.
      * @fl4:    flow key; tos, saddr, daddr and mark are read from it.
      * @skb:    message buffer that also carries the route and, for input
      *          routes, the incoming device.
      * @portid: netlink port id for the message header.
      * @seq:    netlink sequence number for the message header.
      * @event:  netlink message type.
      * @nowait: passed through to ipmr_get_route().
      * @flags:  netlink message flags.
      *
      * Returns the value of nlmsg_end() on success, 0 when ipmr_get_route()
      * returned 0 with @nowait clear, or -EMSGSIZE when the message does not
      * fit (any partially built message is cancelled first).
      */
2228 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2229                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2230                         u32 seq, int event, int nowait, unsigned int flags)
2231 {
2232         struct rtable *rt = skb_rtable(skb);
2233         struct rtmsg *r;
2234         struct nlmsghdr *nlh;
2235         unsigned long expires = 0;
2236         u32 error;
2237         u32 metrics[RTAX_MAX];
2238
2239         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2240         if (nlh == NULL)
2241                 return -EMSGSIZE;
2242
             /* Fixed rtmsg header: always a /32 destination in the main table. */
2243         r = nlmsg_data(nlh);
2244         r->rtm_family    = AF_INET;
2245         r->rtm_dst_len  = 32;
2246         r->rtm_src_len  = 0;
2247         r->rtm_tos      = fl4->flowi4_tos;
2248         r->rtm_table    = RT_TABLE_MAIN;
2249         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2250                 goto nla_put_failure;
2251         r->rtm_type     = rt->rt_type;
2252         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2253         r->rtm_protocol = RTPROT_UNSPEC;
             /* The low 16 bits of rt_flags are masked out of rtm_flags. */
2254         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2255         if (rt->rt_flags & RTCF_NOTIFY)
2256                 r->rtm_flags |= RTM_F_NOTIFY;
2257         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2258                 r->rtm_flags |= RTCF_DOREDIRECT;
2259
2260         if (nla_put_be32(skb, RTA_DST, dst))
2261                 goto nla_put_failure;
2262         if (src) {
2263                 r->rtm_src_len = 32;
2264                 if (nla_put_be32(skb, RTA_SRC, src))
2265                         goto nla_put_failure;
2266         }
2267         if (rt->dst.dev &&
2268             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2269                 goto nla_put_failure;
2270 #ifdef CONFIG_IP_ROUTE_CLASSID
2271         if (rt->dst.tclassid &&
2272             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2273                 goto nla_put_failure;
2274 #endif
             /* For output routes, report the flow's source address as
              * RTA_PREFSRC when it differs from @src.
              */
2275         if (!rt_is_input_route(rt) &&
2276             fl4->saddr != src) {
2277                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2278                         goto nla_put_failure;
2279         }
2280         if (rt->rt_uses_gateway &&
2281             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2282                 goto nla_put_failure;
2283
             /* Turn the absolute expiry into a remaining time relative to
              * the current jiffies; 0 if it has already passed or is unset.
              */
2284         expires = rt->dst.expires;
2285         if (expires) {
2286                 unsigned long now = jiffies;
2287
2288                 if (time_before(now, expires))
2289                         expires -= now;
2290                 else
2291                         expires = 0;
2292         }
2293
             /* Report a copy of the metrics; the MTU slot is replaced by
              * rt_pmtu while that value is set and not yet expired.
              */
2294         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2295         if (rt->rt_pmtu && expires)
2296                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2297         if (rtnetlink_put_metrics(skb, metrics) < 0)
2298                 goto nla_put_failure;
2299
2300         if (fl4->flowi4_mark &&
2301             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2302                 goto nla_put_failure;
2303
2304         error = rt->dst.error;
2305
             /* Input routes: non-link-local multicast destinations are
              * handed to ipmr_get_route() when multicast forwarding is
              * enabled in the namespace; otherwise the incoming interface is
              * reported as RTA_IIF.
              */
2306         if (rt_is_input_route(rt)) {
2307 #ifdef CONFIG_IP_MROUTE
2308                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2309                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2310                         int err = ipmr_get_route(net, skb,
2311                                                  fl4->saddr, fl4->daddr,
2312                                                  r, nowait);
2313                         if (err <= 0) {
2314                                 if (!nowait) {
2315                                         if (err == 0)
2316                                                 return 0;
2317                                         goto nla_put_failure;
2318                                 } else {
2319                                         if (err == -EMSGSIZE)
2320                                                 goto nla_put_failure;
                                             /* Other results are reported
                                              * through the cacheinfo error.
                                              */
2321                                         error = err;
2322                                 }
2323                         }
2324                 } else
2325 #endif
2326                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2327                                 goto nla_put_failure;
2328         }
2329
2330         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2331                 goto nla_put_failure;
2332
2333         return nlmsg_end(skb, nlh);
2334
2335 nla_put_failure:
2336         nlmsg_cancel(skb, nlh);
2337         return -EMSGSIZE;
2338 }
2339
     /*
      * Netlink handler: resolve the route described by the request in @nlh
      * and send the answer back to the requesting port.
      *
      * RTA_SRC, RTA_DST, RTA_IIF, RTA_OIF and RTA_MARK are read from the
      * request.  With RTA_IIF set the route is resolved as for a packet
      * received on that interface (ip_route_input()); otherwise an output
      * route is looked up with ip_route_output_key().  The result is encoded
      * by rt_fill_info() as RTM_NEWROUTE and sent with rtnl_unicast().
      *
      * Returns the rtnl_unicast() result on success.  On failure the reply
      * skb is freed and the error from parsing, allocation (-ENOBUFS),
      * device lookup (-ENODEV), routing or rt_fill_info() is returned.
      */
2340 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2341 {
2342         struct net *net = sock_net(in_skb->sk);
2343         struct rtmsg *rtm;
2344         struct nlattr *tb[RTA_MAX+1];
2345         struct rtable *rt = NULL;
2346         struct flowi4 fl4;
2347         __be32 dst = 0;
2348         __be32 src = 0;
2349         u32 iif;
2350         int err;
2351         int mark;
2352         struct sk_buff *skb;
2353
2354         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2355         if (err < 0)
2356                 goto errout;
2357
2358         rtm = nlmsg_data(nlh);
2359
             /* This skb is both the dummy packet used for the lookup and the
              * buffer the reply is built in.
              */
2360         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2361         if (skb == NULL) {
2362                 err = -ENOBUFS;
2363                 goto errout;
2364         }
2365
2366         /* Reserve room for dummy headers, this skb can pass
2367            through good chunk of routing engine.
2368          */
2369         skb_reset_mac_header(skb);
2370         skb_reset_network_header(skb);
2371
2372         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2373         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2374         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2375
             /* Absent attributes default to 0. */
2376         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2377         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2378         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2379         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2380
2381         memset(&fl4, 0, sizeof(fl4));
2382         fl4.daddr = dst;
2383         fl4.saddr = src;
2384         fl4.flowi4_tos = rtm->rtm_tos;
2385         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2386         fl4.flowi4_mark = mark;
2387
2388         if (iif) {
2389                 struct net_device *dev;
2390
2391                 dev = __dev_get_by_index(net, iif);
2392                 if (dev == NULL) {
2393                         err = -ENODEV;
2394                         goto errout_free;
2395                 }
2396
                     /* Make the dummy skb look like an IPv4 packet received
                      * on @dev; the lookup runs with bottom halves disabled.
                      */
2397                 skb->protocol   = htons(ETH_P_IP);
2398                 skb->dev        = dev;
2399                 skb->mark       = mark;
2400                 local_bh_disable();
2401                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2402                 local_bh_enable();
2403
                     /* A route carrying dst.error counts as a failure. */
2404                 rt = skb_rtable(skb);
2405                 if (err == 0 && rt->dst.error)
2406                         err = -rt->dst.error;
2407         } else {
2408                 rt = ip_route_output_key(net, &fl4);
2409
2410                 err = 0;
2411                 if (IS_ERR(rt))
2412                         err = PTR_ERR(rt);
2413         }
2414
2415         if (err)
2416                 goto errout_free;
2417
2418         skb_dst_set(skb, &rt->dst);
2419         if (rtm->rtm_flags & RTM_F_NOTIFY)
2420                 rt->rt_flags |= RTCF_NOTIFY;
2421
             /* A result <= 0 from rt_fill_info() drops the reply and is
              * returned to the caller as is.
              */
2422         err = rt_fill_info(net, dst, src, &fl4, skb,
2423                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2424                            RTM_NEWROUTE, 0, 0);
2425         if (err <= 0)
2426                 goto errout_free;
2427
2428         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2429 errout:
2430         return err;
2431
2432 errout_free:
2433         kfree_skb(skb);
2434         goto errout;
2435 }
2436
2437 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2438 {
2439         return skb->len;
2440 }
2441
2442 void ip_rt_multicast_event(struct in_device *in_dev)
2443 {
2444         rt_cache_flush(dev_net(in_dev->dev));
2445 }
2446
2447 #ifdef CONFIG_SYSCTL
2448 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2449 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2450 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2451 static int ip_rt_gc_elasticity __read_mostly    = 8;
2452
2453 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2454                                         void __user *buffer,
2455                                         size_t *lenp, loff_t *ppos)
2456 {
2457         if (write) {
2458                 rt_cache_flush((struct net *)__ctl->extra1);
2459                 return 0;
2460         }
2461
2462         return -EINVAL;
2463 }
2464
2465 static ctl_table ipv4_route_table[] = {
2466         {
2467                 .procname       = "gc_thresh",
2468                 .data           = &ipv4_dst_ops.gc_thresh,
2469                 .maxlen         = sizeof(int),
2470                 .mode           = 0644,
2471                 .proc_handler   = proc_dointvec,
2472         },
2473         {
2474                 .procname       = "max_size",
2475                 .data           = &ip_rt_max_size,
2476                 .maxlen         = sizeof(int),
2477                 .mode           = 0644,
2478                 .proc_handler   = proc_dointvec,
2479         },
2480         {
2481                 /*  Deprecated. Use gc_min_interval_ms */
2482
2483                 .procname       = "gc_min_interval",
2484                 .data           = &ip_rt_gc_min_interval,
2485                 .maxlen         = sizeof(int),
2486                 .mode           = 0644,
2487                 .proc_handler   = proc_dointvec_jiffies,
2488         },
2489         {
2490                 .procname       = "gc_min_interval_ms",
2491                 .data           = &ip_rt_gc_min_interval,
2492                 .maxlen         = sizeof(int),
2493                 .mode           = 0644,
2494                 .proc_handler   = proc_dointvec_ms_jiffies,
2495         },
2496         {
2497                 .procname       = "gc_timeout",
2498                 .data           = &ip_rt_gc_timeout,
2499                 .maxlen         = sizeof(int),
2500                 .mode           = 0644,
2501                 .proc_handler   = proc_dointvec_jiffies,
2502         },
2503         {
2504                 .procname       = "gc_interval",
2505                 .data           = &ip_rt_gc_interval,
2506                 .maxlen         = sizeof(int),
2507                 .mode           = 0644,
2508                 .proc_handler   = proc_dointvec_jiffies,
2509         },
2510         {
2511                 .procname       = "redirect_load",
2512                 .data           = &ip_rt_redirect_load,
2513                 .maxlen         = sizeof(int),
2514                 .mode           = 0644,
2515                 .proc_handler   = proc_dointvec,
2516         },
2517         {
2518                 .procname       = "redirect_number",
2519                 .data           = &ip_rt_redirect_number,
2520                 .maxlen         = sizeof(int),
2521                 .mode           = 0644,
2522                 .proc_handler   = proc_dointvec,
2523         },
2524         {
2525                 .procname       = "redirect_silence",
2526                 .data           = &ip_rt_redirect_silence,
2527                 .maxlen         = sizeof(int),
2528                 .mode           = 0644,
2529                 .proc_handler   = proc_dointvec,
2530         },
2531         {
2532                 .procname       = "error_cost",
2533                 .data           = &ip_rt_error_cost,
2534                 .maxlen         = sizeof(int),
2535                 .mode           = 0644,
2536                 .proc_handler   = proc_dointvec,
2537         },
2538         {
2539                 .procname       = "error_burst",
2540                 .data           = &ip_rt_error_burst,
2541                 .maxlen         = sizeof(int),
2542                 .mode           = 0644,
2543                 .proc_handler   = proc_dointvec,
2544         },
2545         {
2546                 .procname       = "gc_elasticity",
2547                 .data           = &ip_rt_gc_elasticity,
2548                 .maxlen         = sizeof(int),
2549                 .mode           = 0644,
2550                 .proc_handler   = proc_dointvec,
2551         },
2552         {
2553                 .procname       = "mtu_expires",
2554                 .data           = &ip_rt_mtu_expires,
2555                 .maxlen         = sizeof(int),
2556                 .mode           = 0644,
2557                 .proc_handler   = proc_dointvec_jiffies,
2558         },
2559         {
2560                 .procname       = "min_pmtu",
2561                 .data           = &ip_rt_min_pmtu,
2562                 .maxlen         = sizeof(int),
2563                 .mode           = 0644,
2564                 .proc_handler   = proc_dointvec,
2565         },
2566         {
2567                 .procname       = "min_adv_mss",
2568                 .data           = &ip_rt_min_advmss,
2569                 .maxlen         = sizeof(int),
2570                 .mode           = 0644,
2571                 .proc_handler   = proc_dointvec,
2572         },
2573         { }
2574 };
2575
2576 static struct ctl_table ipv4_route_flush_table[] = {
2577         {
2578                 .procname       = "flush",
2579                 .maxlen         = sizeof(int),
2580                 .mode           = 0200,
2581                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2582         },
2583         { },
2584 };
2585
2586 static __net_init int sysctl_route_net_init(struct net *net)
2587 {
2588         struct ctl_table *tbl;
2589
2590         tbl = ipv4_route_flush_table;
2591         if (!net_eq(net, &init_net)) {
2592                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2593                 if (tbl == NULL)
2594                         goto err_dup;
2595
2596                 /* Don't export sysctls to unprivileged users */
2597                 if (net->user_ns != &init_user_ns)
2598                         tbl[0].procname = NULL;
2599         }
2600         tbl[0].extra1 = net;
2601
2602         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2603         if (net->ipv4.route_hdr == NULL)
2604                 goto err_reg;
2605         return 0;
2606
2607 err_reg:
2608         if (tbl != ipv4_route_flush_table)
2609                 kfree(tbl);
2610 err_dup:
2611         return -ENOMEM;
2612 }
2613
2614 static __net_exit void sysctl_route_net_exit(struct net *net)
2615 {
2616         struct ctl_table *tbl;
2617
2618         tbl = net->ipv4.route_hdr->ctl_table_arg;
2619         unregister_net_sysctl_table(net->ipv4.route_hdr);
2620         BUG_ON(tbl == ipv4_route_flush_table);
2621         kfree(tbl);
2622 }
2623
2624 static __net_initdata struct pernet_operations sysctl_route_ops = {
2625         .init = sysctl_route_net_init,
2626         .exit = sysctl_route_net_exit,
2627 };
2628 #endif
2629
2630 static __net_init int rt_genid_init(struct net *net)
2631 {
2632         atomic_set(&net->rt_genid, 0);
2633         get_random_bytes(&net->ipv4.dev_addr_genid,
2634                          sizeof(net->ipv4.dev_addr_genid));
2635         return 0;
2636 }
2637
2638 static __net_initdata struct pernet_operations rt_genid_ops = {
2639         .init = rt_genid_init,
2640 };
2641
2642 static int __net_init ipv4_inetpeer_init(struct net *net)
2643 {
2644         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2645
2646         if (!bp)
2647                 return -ENOMEM;
2648         inet_peer_base_init(bp);
2649         net->ipv4.peers = bp;
2650         return 0;
2651 }
2652
2653 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2654 {
2655         struct inet_peer_base *bp = net->ipv4.peers;
2656
2657         net->ipv4.peers = NULL;
2658         inetpeer_invalidate_tree(bp);
2659         kfree(bp);
2660 }
2661
2662 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2663         .init   =       ipv4_inetpeer_init,
2664         .exit   =       ipv4_inetpeer_exit,
2665 };
2666
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU accounting area.  ip_rt_init() allocates it with room for
 * 256 struct ip_rt_acct entries.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2670
2671 int __init ip_rt_init(void)
2672 {
2673         int rc = 0;
2674
2675         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2676         if (!ip_idents)
2677                 panic("IP: failed to allocate ip_idents\n");
2678
2679         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2680
2681 #ifdef CONFIG_IP_ROUTE_CLASSID
2682         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2683         if (!ip_rt_acct)
2684                 panic("IP: failed to allocate ip_rt_acct\n");
2685 #endif
2686
2687         ipv4_dst_ops.kmem_cachep =
2688                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2689                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2690
2691         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2692
2693         if (dst_entries_init(&ipv4_dst_ops) < 0)
2694                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2695
2696         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2697                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2698
2699         ipv4_dst_ops.gc_thresh = ~0;
2700         ip_rt_max_size = INT_MAX;
2701
2702         devinet_init();
2703         ip_fib_init();
2704
2705         if (ip_rt_proc_init())
2706                 pr_err("Unable to create route proc files\n");
2707 #ifdef CONFIG_XFRM
2708         xfrm_init();
2709         xfrm4_init();
2710 #endif
2711         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2712
2713 #ifdef CONFIG_SYSCTL
2714         register_pernet_subsys(&sysctl_route_ops);
2715 #endif
2716         register_pernet_subsys(&rt_genid_ops);
2717         register_pernet_subsys(&ipv4_inetpeer_ops);
2718         return rc;
2719 }
2720
2721 #ifdef CONFIG_SYSCTL
2722 /*
2723  * We really need to sanitize the damn ipv4 init order, then all
2724  * this nonsense will go away.
2725  */
2726 void __init ip_static_sysctl_init(void)
2727 {
2728         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2729 }
2730 #endif