net: Don't proxy arp respond if iif == rt->dst.dev if private VLAN is disabled
[firefly-linux-kernel-4.4.55.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/secure_seq.h>
113
114 #define RT_FL_TOS(oldflp4) \
115     ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
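/*
 * RT_FL_TOS() keeps only the TOS bits that matter for route lookup
 * (IPTOS_RT_MASK) plus the RTO_ONLINK flag that callers may encode
 * into flowi4_tos.
 */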
116
117 #define IP_MAX_MTU      0xFFF0
118
119 #define RT_GC_TIMEOUT (300*HZ)
120
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
124 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
125 static int ip_rt_redirect_number __read_mostly  = 9;
126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly       = HZ;
129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
130 static int ip_rt_gc_elasticity __read_mostly    = 8;
131 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
133 static int ip_rt_min_advmss __read_mostly       = 256;
134 static int rt_chain_length_max __read_mostly    = 20;
135
136 static struct delayed_work expires_work;
137 static unsigned long expires_ljiffies;
138
139 /*
140  *      Interface to generic destination cache.
141  */
142
143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
145 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
146 static void              ipv4_dst_destroy(struct dst_entry *dst);
147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148 static void              ipv4_link_failure(struct sk_buff *skb);
149 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
150 static int rt_garbage_collect(struct dst_ops *ops);
151
152 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
153                             int how)
154 {
155 }
156
157 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
158 {
159         struct rtable *rt = (struct rtable *) dst;
160         struct inet_peer *peer;
161         u32 *p = NULL;
162
163         if (!rt->peer)
164                 rt_bind_peer(rt, rt->rt_dst, 1);
165
166         peer = rt->peer;
167         if (peer) {
168                 u32 *old_p = __DST_METRICS_PTR(old);
169                 unsigned long prev, new;
170
171                 p = peer->metrics;
172                 if (inet_metrics_new(peer))
173                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
174
175                 new = (unsigned long) p;
176                 prev = cmpxchg(&dst->_metrics, old, new);
177
178                 if (prev != old) {
179                         p = __DST_METRICS_PTR(prev);
180                         if (prev & DST_METRICS_READ_ONLY)
181                                 p = NULL;
182                 } else {
183                         if (rt->fi) {
184                                 fib_info_put(rt->fi);
185                                 rt->fi = NULL;
186                         }
187                 }
188         }
189         return p;
190 }
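/*
 * Copy-on-write metrics: the writable copy lives in the route's inet_peer.
 * The cmpxchg() publishes peer->metrics in dst->_metrics; if we won the
 * race, the fib_info reference is dropped because its shared, read-only
 * metrics are no longer used by this route.
 */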
191
192 static struct dst_ops ipv4_dst_ops = {
193         .family =               AF_INET,
194         .protocol =             cpu_to_be16(ETH_P_IP),
195         .gc =                   rt_garbage_collect,
196         .check =                ipv4_dst_check,
197         .default_advmss =       ipv4_default_advmss,
198         .default_mtu =          ipv4_default_mtu,
199         .cow_metrics =          ipv4_cow_metrics,
200         .destroy =              ipv4_dst_destroy,
201         .ifdown =               ipv4_dst_ifdown,
202         .negative_advice =      ipv4_negative_advice,
203         .link_failure =         ipv4_link_failure,
204         .update_pmtu =          ip_rt_update_pmtu,
205         .local_out =            __ip_local_out,
206 };
207
208 #define ECN_OR_COST(class)      TC_PRIO_##class
209
210 const __u8 ip_tos2prio[16] = {
211         TC_PRIO_BESTEFFORT,
212         ECN_OR_COST(BESTEFFORT),
213         TC_PRIO_BESTEFFORT,
214         ECN_OR_COST(BESTEFFORT),
215         TC_PRIO_BULK,
216         ECN_OR_COST(BULK),
217         TC_PRIO_BULK,
218         ECN_OR_COST(BULK),
219         TC_PRIO_INTERACTIVE,
220         ECN_OR_COST(INTERACTIVE),
221         TC_PRIO_INTERACTIVE,
222         ECN_OR_COST(INTERACTIVE),
223         TC_PRIO_INTERACTIVE_BULK,
224         ECN_OR_COST(INTERACTIVE_BULK),
225         TC_PRIO_INTERACTIVE_BULK,
226         ECN_OR_COST(INTERACTIVE_BULK)
227 };
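/*
 * ip_tos2prio[] is indexed with IPTOS_TOS(tos) >> 1 (see rt_tos2priority()).
 * Odd slots cover TOS values with bit 0x02 set (historically "minimize
 * cost", nowadays part of the ECN field), which is why ECN_OR_COST()
 * simply aliases the base class.
 */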
228
229
230 /*
231  * Route cache.
232  */
233
234 /* The locking scheme is rather straightforward:
235  *
236  * 1) Read-Copy Update protects the buckets of the central route hash.
237  * 2) Only writers remove entries, and they hold the lock
238  *    as they look at rtable reference counts.
239  * 3) Only readers acquire references to rtable entries,
240  *    they do so with atomic increments and with the
241  *    lock held.
242  */
243
244 struct rt_hash_bucket {
245         struct rtable __rcu     *chain;
246 };
247
248 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
249         defined(CONFIG_PROVE_LOCKING)
250 /*
251  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
252  * The size of this table is a power of two and depends on the number of CPUs.
253  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
254  */
255 #ifdef CONFIG_LOCKDEP
256 # define RT_HASH_LOCK_SZ        256
257 #else
258 # if NR_CPUS >= 32
259 #  define RT_HASH_LOCK_SZ       4096
260 # elif NR_CPUS >= 16
261 #  define RT_HASH_LOCK_SZ       2048
262 # elif NR_CPUS >= 8
263 #  define RT_HASH_LOCK_SZ       1024
264 # elif NR_CPUS >= 4
265 #  define RT_HASH_LOCK_SZ       512
266 # else
267 #  define RT_HASH_LOCK_SZ       256
268 # endif
269 #endif
270
271 static spinlock_t       *rt_hash_locks;
272 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
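/*
 * Several hash buckets share one spinlock: bucket i uses
 * rt_hash_locks[i & (RT_HASH_LOCK_SZ - 1)], so the lock table stays
 * bounded no matter how large the route hash itself grows.
 */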
273
274 static __init void rt_hash_lock_init(void)
275 {
276         int i;
277
278         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
279                         GFP_KERNEL);
280         if (!rt_hash_locks)
281                 panic("IP: failed to allocate rt_hash_locks\n");
282
283         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
284                 spin_lock_init(&rt_hash_locks[i]);
285 }
286 #else
287 # define rt_hash_lock_addr(slot) NULL
288
289 static inline void rt_hash_lock_init(void)
290 {
291 }
292 #endif
293
294 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
295 static unsigned                 rt_hash_mask __read_mostly;
296 static unsigned int             rt_hash_log  __read_mostly;
297
298 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
299 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
300
301 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
302                                    int genid)
303 {
304         return jhash_3words((__force u32)daddr, (__force u32)saddr,
305                             idx, genid)
306                 & rt_hash_mask;
307 }
308
309 static inline int rt_genid(struct net *net)
310 {
311         return atomic_read(&net->ipv4.rt_genid);
312 }
313
314 #ifdef CONFIG_PROC_FS
315 struct rt_cache_iter_state {
316         struct seq_net_private p;
317         int bucket;
318         int genid;
319 };
320
321 static struct rtable *rt_cache_get_first(struct seq_file *seq)
322 {
323         struct rt_cache_iter_state *st = seq->private;
324         struct rtable *r = NULL;
325
326         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
327                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
328                         continue;
329                 rcu_read_lock_bh();
330                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
331                 while (r) {
332                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
333                             r->rt_genid == st->genid)
334                                 return r;
335                         r = rcu_dereference_bh(r->dst.rt_next);
336                 }
337                 rcu_read_unlock_bh();
338         }
339         return r;
340 }
341
342 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
343                                           struct rtable *r)
344 {
345         struct rt_cache_iter_state *st = seq->private;
346
347         r = rcu_dereference_bh(r->dst.rt_next);
348         while (!r) {
349                 rcu_read_unlock_bh();
350                 do {
351                         if (--st->bucket < 0)
352                                 return NULL;
353                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
354                 rcu_read_lock_bh();
355                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
356         }
357         return r;
358 }
359
360 static struct rtable *rt_cache_get_next(struct seq_file *seq,
361                                         struct rtable *r)
362 {
363         struct rt_cache_iter_state *st = seq->private;
364         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
365                 if (dev_net(r->dst.dev) != seq_file_net(seq))
366                         continue;
367                 if (r->rt_genid == st->genid)
368                         break;
369         }
370         return r;
371 }
372
373 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
374 {
375         struct rtable *r = rt_cache_get_first(seq);
376
377         if (r)
378                 while (pos && (r = rt_cache_get_next(seq, r)))
379                         --pos;
380         return pos ? NULL : r;
381 }
382
383 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
384 {
385         struct rt_cache_iter_state *st = seq->private;
386         if (*pos)
387                 return rt_cache_get_idx(seq, *pos - 1);
388         st->genid = rt_genid(seq_file_net(seq));
389         return SEQ_START_TOKEN;
390 }
391
392 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
393 {
394         struct rtable *r;
395
396         if (v == SEQ_START_TOKEN)
397                 r = rt_cache_get_first(seq);
398         else
399                 r = rt_cache_get_next(seq, v);
400         ++*pos;
401         return r;
402 }
403
404 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
405 {
406         if (v && v != SEQ_START_TOKEN)
407                 rcu_read_unlock_bh();
408 }
409
410 static int rt_cache_seq_show(struct seq_file *seq, void *v)
411 {
412         if (v == SEQ_START_TOKEN)
413                 seq_printf(seq, "%-127s\n",
414                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
415                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
416                            "HHUptod\tSpecDst");
417         else {
418                 struct rtable *r = v;
419                 struct neighbour *n;
420                 int len, HHUptod;
421
422                 rcu_read_lock();
423                 n = dst_get_neighbour(&r->dst);
424                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
425                 rcu_read_unlock();
426
427                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
428                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
429                         r->dst.dev ? r->dst.dev->name : "*",
430                         (__force u32)r->rt_dst,
431                         (__force u32)r->rt_gateway,
432                         r->rt_flags, atomic_read(&r->dst.__refcnt),
433                         r->dst.__use, 0, (__force u32)r->rt_src,
434                         dst_metric_advmss(&r->dst) + 40,
435                         dst_metric(&r->dst, RTAX_WINDOW),
436                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
437                               dst_metric(&r->dst, RTAX_RTTVAR)),
438                         r->rt_key_tos,
439                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
440                         HHUptod,
441                         r->rt_spec_dst, &len);
442
443                 seq_printf(seq, "%*s\n", 127 - len, "");
444         }
445         return 0;
446 }
447
448 static const struct seq_operations rt_cache_seq_ops = {
449         .start  = rt_cache_seq_start,
450         .next   = rt_cache_seq_next,
451         .stop   = rt_cache_seq_stop,
452         .show   = rt_cache_seq_show,
453 };
454
455 static int rt_cache_seq_open(struct inode *inode, struct file *file)
456 {
457         return seq_open_net(inode, file, &rt_cache_seq_ops,
458                         sizeof(struct rt_cache_iter_state));
459 }
460
461 static const struct file_operations rt_cache_seq_fops = {
462         .owner   = THIS_MODULE,
463         .open    = rt_cache_seq_open,
464         .read    = seq_read,
465         .llseek  = seq_lseek,
466         .release = seq_release_net,
467 };
468
469
470 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
471 {
472         int cpu;
473
474         if (*pos == 0)
475                 return SEQ_START_TOKEN;
476
477         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
478                 if (!cpu_possible(cpu))
479                         continue;
480                 *pos = cpu+1;
481                 return &per_cpu(rt_cache_stat, cpu);
482         }
483         return NULL;
484 }
485
486 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
487 {
488         int cpu;
489
490         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
491                 if (!cpu_possible(cpu))
492                         continue;
493                 *pos = cpu+1;
494                 return &per_cpu(rt_cache_stat, cpu);
495         }
496         return NULL;
497
498 }
499
500 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
501 {
502
503 }
504
505 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
506 {
507         struct rt_cache_stat *st = v;
508
509         if (v == SEQ_START_TOKEN) {
510                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
511                 return 0;
512         }
513
514         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
515                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
516                    dst_entries_get_slow(&ipv4_dst_ops),
517                    st->in_hit,
518                    st->in_slow_tot,
519                    st->in_slow_mc,
520                    st->in_no_route,
521                    st->in_brd,
522                    st->in_martian_dst,
523                    st->in_martian_src,
524
525                    st->out_hit,
526                    st->out_slow_tot,
527                    st->out_slow_mc,
528
529                    st->gc_total,
530                    st->gc_ignored,
531                    st->gc_goal_miss,
532                    st->gc_dst_overflow,
533                    st->in_hlist_search,
534                    st->out_hlist_search
535                 );
536         return 0;
537 }
538
539 static const struct seq_operations rt_cpu_seq_ops = {
540         .start  = rt_cpu_seq_start,
541         .next   = rt_cpu_seq_next,
542         .stop   = rt_cpu_seq_stop,
543         .show   = rt_cpu_seq_show,
544 };
545
546
547 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
548 {
549         return seq_open(file, &rt_cpu_seq_ops);
550 }
551
552 static const struct file_operations rt_cpu_seq_fops = {
553         .owner   = THIS_MODULE,
554         .open    = rt_cpu_seq_open,
555         .read    = seq_read,
556         .llseek  = seq_lseek,
557         .release = seq_release,
558 };
559
560 #ifdef CONFIG_IP_ROUTE_CLASSID
561 static int rt_acct_proc_show(struct seq_file *m, void *v)
562 {
563         struct ip_rt_acct *dst, *src;
564         unsigned int i, j;
565
566         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
567         if (!dst)
568                 return -ENOMEM;
569
570         for_each_possible_cpu(i) {
571                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
572                 for (j = 0; j < 256; j++) {
573                         dst[j].o_bytes   += src[j].o_bytes;
574                         dst[j].o_packets += src[j].o_packets;
575                         dst[j].i_bytes   += src[j].i_bytes;
576                         dst[j].i_packets += src[j].i_packets;
577                 }
578         }
579
580         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
581         kfree(dst);
582         return 0;
583 }
584
585 static int rt_acct_proc_open(struct inode *inode, struct file *file)
586 {
587         return single_open(file, rt_acct_proc_show, NULL);
588 }
589
590 static const struct file_operations rt_acct_proc_fops = {
591         .owner          = THIS_MODULE,
592         .open           = rt_acct_proc_open,
593         .read           = seq_read,
594         .llseek         = seq_lseek,
595         .release        = single_release,
596 };
597 #endif
598
599 static int __net_init ip_rt_do_proc_init(struct net *net)
600 {
601         struct proc_dir_entry *pde;
602
603         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
604                         &rt_cache_seq_fops);
605         if (!pde)
606                 goto err1;
607
608         pde = proc_create("rt_cache", S_IRUGO,
609                           net->proc_net_stat, &rt_cpu_seq_fops);
610         if (!pde)
611                 goto err2;
612
613 #ifdef CONFIG_IP_ROUTE_CLASSID
614         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
615         if (!pde)
616                 goto err3;
617 #endif
618         return 0;
619
620 #ifdef CONFIG_IP_ROUTE_CLASSID
621 err3:
622         remove_proc_entry("rt_cache", net->proc_net_stat);
623 #endif
624 err2:
625         remove_proc_entry("rt_cache", net->proc_net);
626 err1:
627         return -ENOMEM;
628 }
629
630 static void __net_exit ip_rt_do_proc_exit(struct net *net)
631 {
632         remove_proc_entry("rt_cache", net->proc_net_stat);
633         remove_proc_entry("rt_cache", net->proc_net);
634 #ifdef CONFIG_IP_ROUTE_CLASSID
635         remove_proc_entry("rt_acct", net->proc_net);
636 #endif
637 }
638
639 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
640         .init = ip_rt_do_proc_init,
641         .exit = ip_rt_do_proc_exit,
642 };
643
644 static int __init ip_rt_proc_init(void)
645 {
646         return register_pernet_subsys(&ip_rt_proc_ops);
647 }
648
649 #else
650 static inline int ip_rt_proc_init(void)
651 {
652         return 0;
653 }
654 #endif /* CONFIG_PROC_FS */
655
656 static inline void rt_free(struct rtable *rt)
657 {
658         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
659 }
660
661 static inline void rt_drop(struct rtable *rt)
662 {
663         ip_rt_put(rt);
664         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
665 }
666
667 static inline int rt_fast_clean(struct rtable *rth)
668 {
669         /* Kill broadcast/multicast entries very aggressively, if they
670            collide in hash table with more useful entries */
671         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
672                 rt_is_input_route(rth) && rth->dst.rt_next;
673 }
674
675 static inline int rt_valuable(struct rtable *rth)
676 {
677         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
678                 (rth->peer && rth->peer->pmtu_expires);
679 }
680
681 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
682 {
683         unsigned long age;
684         int ret = 0;
685
686         if (atomic_read(&rth->dst.__refcnt))
687                 goto out;
688
689         age = jiffies - rth->dst.lastuse;
690         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
691             (age <= tmo2 && rt_valuable(rth)))
692                 goto out;
693         ret = 1;
694 out:    return ret;
695 }
696
697 /* Bits of score are:
698  * 31: very valuable
699  * 30: not quite useless
700  * 29..0: usage counter
701  */
702 static inline u32 rt_score(struct rtable *rt)
703 {
704         u32 score = jiffies - rt->dst.lastuse;
705
706         score = ~score & ~(3<<30);
707
708         if (rt_valuable(rt))
709                 score |= (1<<31);
710
711         if (rt_is_output_route(rt) ||
712             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
713                 score |= (1<<30);
714
715         return score;
716 }
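/*
 * A lower score marks a less valuable entry: rt_intern_hash() remembers
 * the minimum-score unreferenced entry of a chain ("cand") and frees it
 * once the chain grows beyond ip_rt_gc_elasticity.
 */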
717
718 static inline bool rt_caching(const struct net *net)
719 {
720         return net->ipv4.current_rt_cache_rebuild_count <=
721                 net->ipv4.sysctl_rt_cache_rebuild_count;
722 }
723
724 static inline bool compare_hash_inputs(const struct rtable *rt1,
725                                        const struct rtable *rt2)
726 {
727         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
728                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
729                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
730 }
731
732 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
733 {
734         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
735                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
736                 (rt1->rt_mark ^ rt2->rt_mark) |
737                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
738                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
739                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
740 }
741
742 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
743 {
744         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
745 }
746
747 static inline int rt_is_expired(struct rtable *rth)
748 {
749         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
750 }
751
752 /*
753  * Perform a full scan of hash table and free all entries.
754  * Can be called by a softirq or a process.
755  * In the latter case, we want to reschedule if necessary.
756  */
757 static void rt_do_flush(struct net *net, int process_context)
758 {
759         unsigned int i;
760         struct rtable *rth, *next;
761
762         for (i = 0; i <= rt_hash_mask; i++) {
763                 struct rtable __rcu **pprev;
764                 struct rtable *list;
765
766                 if (process_context && need_resched())
767                         cond_resched();
768                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
769                 if (!rth)
770                         continue;
771
772                 spin_lock_bh(rt_hash_lock_addr(i));
773
774                 list = NULL;
775                 pprev = &rt_hash_table[i].chain;
776                 rth = rcu_dereference_protected(*pprev,
777                         lockdep_is_held(rt_hash_lock_addr(i)));
778
779                 while (rth) {
780                         next = rcu_dereference_protected(rth->dst.rt_next,
781                                 lockdep_is_held(rt_hash_lock_addr(i)));
782
783                         if (!net ||
784                             net_eq(dev_net(rth->dst.dev), net)) {
785                                 rcu_assign_pointer(*pprev, next);
786                                 rcu_assign_pointer(rth->dst.rt_next, list);
787                                 list = rth;
788                         } else {
789                                 pprev = &rth->dst.rt_next;
790                         }
791                         rth = next;
792                 }
793
794                 spin_unlock_bh(rt_hash_lock_addr(i));
795
796                 for (; list; list = next) {
797                         next = rcu_dereference_protected(list->dst.rt_next, 1);
798                         rt_free(list);
799                 }
800         }
801 }
802
803 /*
804  * While freeing expired entries, we compute average chain length
805  * and standard deviation, using fixed-point arithmetic.
806  * This gives an estimate for rt_chain_length_max:
807  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
808  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
809  */
810
811 #define FRACT_BITS 3
812 #define ONE (1UL << FRACT_BITS)
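/*
 * Chain lengths are accumulated in this fixed-point scale: ONE (== 8)
 * stands for a length of 1, so e.g. a sum of 20 corresponds to 2.5 real
 * entries.  rt_check_expire() only converts back (>> FRACT_BITS) when it
 * derives rt_chain_length_max from the average and standard deviation.
 */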
813
814 /*
815  * Given a hash chain and an item in this hash chain,
816  * find if a previous entry has the same hash_inputs
817  * (but differs on tos, mark or oif)
818  * Returns 0 if an alias is found.
819  * Returns ONE if rth has no alias before itself.
820  */
821 static int has_noalias(const struct rtable *head, const struct rtable *rth)
822 {
823         const struct rtable *aux = head;
824
825         while (aux != rth) {
826                 if (compare_hash_inputs(aux, rth))
827                         return 0;
828                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
829         }
830         return ONE;
831 }
832
833 static void rt_check_expire(void)
834 {
835         static unsigned int rover;
836         unsigned int i = rover, goal;
837         struct rtable *rth;
838         struct rtable __rcu **rthp;
839         unsigned long samples = 0;
840         unsigned long sum = 0, sum2 = 0;
841         unsigned long delta;
842         u64 mult;
843
844         delta = jiffies - expires_ljiffies;
845         expires_ljiffies = jiffies;
846         mult = ((u64)delta) << rt_hash_log;
847         if (ip_rt_gc_timeout > 1)
848                 do_div(mult, ip_rt_gc_timeout);
849         goal = (unsigned int)mult;
850         if (goal > rt_hash_mask)
851                 goal = rt_hash_mask + 1;
852         for (; goal > 0; goal--) {
853                 unsigned long tmo = ip_rt_gc_timeout;
854                 unsigned long length;
855
856                 i = (i + 1) & rt_hash_mask;
857                 rthp = &rt_hash_table[i].chain;
858
859                 if (need_resched())
860                         cond_resched();
861
862                 samples++;
863
864                 if (rcu_dereference_raw(*rthp) == NULL)
865                         continue;
866                 length = 0;
867                 spin_lock_bh(rt_hash_lock_addr(i));
868                 while ((rth = rcu_dereference_protected(*rthp,
869                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
870                         prefetch(rth->dst.rt_next);
871                         if (rt_is_expired(rth)) {
872                                 *rthp = rth->dst.rt_next;
873                                 rt_free(rth);
874                                 continue;
875                         }
876                         if (rth->dst.expires) {
877                                 /* Entry is expired even if it is in use */
878                                 if (time_before_eq(jiffies, rth->dst.expires)) {
879 nofree:
880                                         tmo >>= 1;
881                                         rthp = &rth->dst.rt_next;
882                                         /*
883                                          * We only count entries on
884                                          * a chain with equal hash inputs once
885                                          * so that entries for different QOS
886                                          * levels, and other non-hash input
887                                          * attributes don't unfairly skew
888                                          * the length computation
889                                          */
890                                         length += has_noalias(rt_hash_table[i].chain, rth);
891                                         continue;
892                                 }
893                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
894                                 goto nofree;
895
896                         /* Cleanup aged off entries. */
897                         *rthp = rth->dst.rt_next;
898                         rt_free(rth);
899                 }
900                 spin_unlock_bh(rt_hash_lock_addr(i));
901                 sum += length;
902                 sum2 += length*length;
903         }
904         if (samples) {
905                 unsigned long avg = sum / samples;
906                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
907                 rt_chain_length_max = max_t(unsigned long,
908                                         ip_rt_gc_elasticity,
909                                         (avg + 4*sd) >> FRACT_BITS);
910         }
911         rover = i;
912 }
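/*
 * Each pass scans goal = (delta << rt_hash_log) / ip_rt_gc_timeout buckets,
 * where delta is the time since the previous pass, so on average the whole
 * table of rt_hash_mask + 1 buckets is walked about once per
 * ip_rt_gc_timeout (300 seconds by default).
 */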
913
914 /*
915  * rt_worker_func() is run in process context.
916  * We call rt_check_expire() to scan part of the hash table.
917  */
918 static void rt_worker_func(struct work_struct *work)
919 {
920         rt_check_expire();
921         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
922 }
923
924 /*
925  * Perturbation of rt_genid by a small quantity [1..256]
926  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
927  * many times (2^24) without reusing a recent rt_genid.
928  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
929  */
930 static void rt_cache_invalidate(struct net *net)
931 {
932         unsigned char shuffle;
933
934         get_random_bytes(&shuffle, sizeof(shuffle));
935         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
936 }
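/*
 * Bumping rt_genid is the entire invalidation: existing cache entries keep
 * their old genid, so rt_is_expired() becomes true for them and they are
 * reaped lazily the next time a lookup or the garbage collector walks
 * over them.
 */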
937
938 /*
939  * delay < 0  : invalidate cache (fast : entries will be deleted later)
940  * delay >= 0 : invalidate & flush cache (can be long)
941  */
942 void rt_cache_flush(struct net *net, int delay)
943 {
944         rt_cache_invalidate(net);
945         if (delay >= 0)
946                 rt_do_flush(net, !in_softirq());
947 }
948
949 /* Flush previous cache invalidated entries from the cache */
950 void rt_cache_flush_batch(struct net *net)
951 {
952         rt_do_flush(net, !in_softirq());
953 }
954
955 static void rt_emergency_hash_rebuild(struct net *net)
956 {
957         if (net_ratelimit())
958                 printk(KERN_WARNING "Route hash chain too long!\n");
959         rt_cache_invalidate(net);
960 }
961
962 /*
963    Short description of GC goals.
964
965    We want to build an algorithm which keeps the routing cache
966    at some equilibrium point, where the number of aged-off entries
967    stays approximately equal to the number of newly generated ones.
968
969    The current expiration strength is the variable "expire".
970    We try to adjust it dynamically, so that when networking
971    is idle, expire is large enough to keep enough warm entries,
972    and when load increases it shrinks to limit the cache size.
973  */
974
975 static int rt_garbage_collect(struct dst_ops *ops)
976 {
977         static unsigned long expire = RT_GC_TIMEOUT;
978         static unsigned long last_gc;
979         static int rover;
980         static int equilibrium;
981         struct rtable *rth;
982         struct rtable __rcu **rthp;
983         unsigned long now = jiffies;
984         int goal;
985         int entries = dst_entries_get_fast(&ipv4_dst_ops);
986
987         /*
988          * Garbage collection is pretty expensive,
989          * do not make it too frequently.
990          */
991
992         RT_CACHE_STAT_INC(gc_total);
993
994         if (now - last_gc < ip_rt_gc_min_interval &&
995             entries < ip_rt_max_size) {
996                 RT_CACHE_STAT_INC(gc_ignored);
997                 goto out;
998         }
999
1000         entries = dst_entries_get_slow(&ipv4_dst_ops);
1001         /* Calculate number of entries, which we want to expire now. */
1002         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1003         if (goal <= 0) {
1004                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1005                         equilibrium = ipv4_dst_ops.gc_thresh;
1006                 goal = entries - equilibrium;
1007                 if (goal > 0) {
1008                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1009                         goal = entries - equilibrium;
1010                 }
1011         } else {
1012                 /* We are in a dangerous area. Try to reduce the cache really
1013                  * aggressively.
1014                  */
1015                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1016                 equilibrium = entries - goal;
1017         }
1018
1019         if (now - last_gc >= ip_rt_gc_min_interval)
1020                 last_gc = now;
1021
1022         if (goal <= 0) {
1023                 equilibrium += goal;
1024                 goto work_done;
1025         }
1026
1027         do {
1028                 int i, k;
1029
1030                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1031                         unsigned long tmo = expire;
1032
1033                         k = (k + 1) & rt_hash_mask;
1034                         rthp = &rt_hash_table[k].chain;
1035                         spin_lock_bh(rt_hash_lock_addr(k));
1036                         while ((rth = rcu_dereference_protected(*rthp,
1037                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1038                                 if (!rt_is_expired(rth) &&
1039                                         !rt_may_expire(rth, tmo, expire)) {
1040                                         tmo >>= 1;
1041                                         rthp = &rth->dst.rt_next;
1042                                         continue;
1043                                 }
1044                                 *rthp = rth->dst.rt_next;
1045                                 rt_free(rth);
1046                                 goal--;
1047                         }
1048                         spin_unlock_bh(rt_hash_lock_addr(k));
1049                         if (goal <= 0)
1050                                 break;
1051                 }
1052                 rover = k;
1053
1054                 if (goal <= 0)
1055                         goto work_done;
1056
1057                 /* Goal is not achieved. We stop the process if:
1058
1059                    - expire has been reduced to zero; otherwise, expire is halved.
1060                    - the table is not full.
1061                    - we are called from interrupt context.
1062                    - the jiffies check is just a fallback/debug loop breaker;
1063                      we will not spin here for a long time in any case.
1064                  */
1065
1066                 RT_CACHE_STAT_INC(gc_goal_miss);
1067
1068                 if (expire == 0)
1069                         break;
1070
1071                 expire >>= 1;
1072
1073                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1074                         goto out;
1075         } while (!in_softirq() && time_before_eq(jiffies, now));
1076
1077         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1078                 goto out;
1079         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1080                 goto out;
1081         if (net_ratelimit())
1082                 printk(KERN_WARNING "dst cache overflow\n");
1083         RT_CACHE_STAT_INC(gc_dst_overflow);
1084         return 1;
1085
1086 work_done:
1087         expire += ip_rt_gc_min_interval;
1088         if (expire > ip_rt_gc_timeout ||
1089             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1090             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1091                 expire = ip_rt_gc_timeout;
1092 out:    return 0;
1093 }
1094
1095 /*
1096  * Returns number of entries in a hash chain that have different hash_inputs
1097  */
1098 static int slow_chain_length(const struct rtable *head)
1099 {
1100         int length = 0;
1101         const struct rtable *rth = head;
1102
1103         while (rth) {
1104                 length += has_noalias(head, rth);
1105                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1106         }
1107         return length >> FRACT_BITS;
1108 }
1109
1110 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1111                                      struct sk_buff *skb, int ifindex)
1112 {
1113         struct rtable   *rth, *cand;
1114         struct rtable __rcu **rthp, **candp;
1115         unsigned long   now;
1116         u32             min_score;
1117         int             chain_length;
1118         int attempts = !in_softirq();
1119
1120 restart:
1121         chain_length = 0;
1122         min_score = ~(u32)0;
1123         cand = NULL;
1124         candp = NULL;
1125         now = jiffies;
1126
1127         if (!rt_caching(dev_net(rt->dst.dev))) {
1128                 /*
1129                  * If we're not caching, just tell the caller we
1130                  * were successful and don't touch the route.  The
1131                  * caller holds the sole reference to the cache entry, and
1132                  * it will be released when the caller is done with it.
1133                  * If we drop it here, the callers have no way to resolve routes
1134                  * when we're not caching.  Instead, just hand rt back to the
1135                  * caller, so it gets a single use out of the route.
1136                  * Note that we do rt_free on this new route entry, so that
1137                  * once its refcount hits zero, we are still able to reap it
1138                  * (Thanks Alexey)
1139                  * Note: To avoid expensive rcu stuff for this uncached dst,
1140                  * we set DST_NOCACHE so that dst_release() can free dst without
1141                  * waiting a grace period.
1142                  */
1143
1144                 rt->dst.flags |= DST_NOCACHE;
1145                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1146                         int err = arp_bind_neighbour(&rt->dst);
1147                         if (err) {
1148                                 if (net_ratelimit())
1149                                         printk(KERN_WARNING
1150                                             "Neighbour table failure & not caching routes.\n");
1151                                 ip_rt_put(rt);
1152                                 return ERR_PTR(err);
1153                         }
1154                 }
1155
1156                 goto skip_hashing;
1157         }
1158
1159         rthp = &rt_hash_table[hash].chain;
1160
1161         spin_lock_bh(rt_hash_lock_addr(hash));
1162         while ((rth = rcu_dereference_protected(*rthp,
1163                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1164                 if (rt_is_expired(rth)) {
1165                         *rthp = rth->dst.rt_next;
1166                         rt_free(rth);
1167                         continue;
1168                 }
1169                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1170                         /* Put it first */
1171                         *rthp = rth->dst.rt_next;
1172                         /*
1173                          * Since lookup is lockfree, the deletion
1174                          * must be visible to another weakly ordered CPU before
1175                          * the insertion at the start of the hash chain.
1176                          */
1177                         rcu_assign_pointer(rth->dst.rt_next,
1178                                            rt_hash_table[hash].chain);
1179                         /*
1180                          * Since lookup is lockfree, the update writes
1181                          * must be ordered for consistency on SMP.
1182                          */
1183                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1184
1185                         dst_use(&rth->dst, now);
1186                         spin_unlock_bh(rt_hash_lock_addr(hash));
1187
1188                         rt_drop(rt);
1189                         if (skb)
1190                                 skb_dst_set(skb, &rth->dst);
1191                         return rth;
1192                 }
1193
1194                 if (!atomic_read(&rth->dst.__refcnt)) {
1195                         u32 score = rt_score(rth);
1196
1197                         if (score <= min_score) {
1198                                 cand = rth;
1199                                 candp = rthp;
1200                                 min_score = score;
1201                         }
1202                 }
1203
1204                 chain_length++;
1205
1206                 rthp = &rth->dst.rt_next;
1207         }
1208
1209         if (cand) {
1210                 /* ip_rt_gc_elasticity used to be the average chain length;
1211                  * when it is exceeded, gc becomes really aggressive.
1212                  *
1213                  * The second limit is less certain. At the moment it allows
1214                  * only 2 entries per bucket. We will see.
1215                  */
1216                 if (chain_length > ip_rt_gc_elasticity) {
1217                         *candp = cand->dst.rt_next;
1218                         rt_free(cand);
1219                 }
1220         } else {
1221                 if (chain_length > rt_chain_length_max &&
1222                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1223                         struct net *net = dev_net(rt->dst.dev);
1224                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1225                         if (!rt_caching(net)) {
1226                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1227                                         rt->dst.dev->name, num);
1228                         }
1229                         rt_emergency_hash_rebuild(net);
1230                         spin_unlock_bh(rt_hash_lock_addr(hash));
1231
1232                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1233                                         ifindex, rt_genid(net));
1234                         goto restart;
1235                 }
1236         }
1237
1238         /* Try to bind route to arp only if it is output
1239            route or unicast forwarding path.
1240          */
1241         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1242                 int err = arp_bind_neighbour(&rt->dst);
1243                 if (err) {
1244                         spin_unlock_bh(rt_hash_lock_addr(hash));
1245
1246                         if (err != -ENOBUFS) {
1247                                 rt_drop(rt);
1248                                 return ERR_PTR(err);
1249                         }
1250
1251                         /* Neighbour tables are full and nothing
1252                            can be released. Try to shrink the route cache,
1253                            as it most likely holds some neighbour records.
1254                          */
1255                         if (attempts-- > 0) {
1256                                 int saved_elasticity = ip_rt_gc_elasticity;
1257                                 int saved_int = ip_rt_gc_min_interval;
1258                                 ip_rt_gc_elasticity     = 1;
1259                                 ip_rt_gc_min_interval   = 0;
1260                                 rt_garbage_collect(&ipv4_dst_ops);
1261                                 ip_rt_gc_min_interval   = saved_int;
1262                                 ip_rt_gc_elasticity     = saved_elasticity;
1263                                 goto restart;
1264                         }
1265
1266                         if (net_ratelimit())
1267                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1268                         rt_drop(rt);
1269                         return ERR_PTR(-ENOBUFS);
1270                 }
1271         }
1272
1273         rt->dst.rt_next = rt_hash_table[hash].chain;
1274
1275         /*
1276          * Since lookup is lockfree, we must make sure
1277          * previous writes to rt are committed to memory
1278          * before making rt visible to other CPUS.
1279          */
1280         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1281
1282         spin_unlock_bh(rt_hash_lock_addr(hash));
1283
1284 skip_hashing:
1285         if (skb)
1286                 skb_dst_set(skb, &rt->dst);
1287         return rt;
1288 }
1289
1290 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1291
1292 static u32 rt_peer_genid(void)
1293 {
1294         return atomic_read(&__rt_peer_genid);
1295 }
1296
1297 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1298 {
1299         struct inet_peer *peer;
1300
1301         peer = inet_getpeer_v4(daddr, create);
1302
1303         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1304                 inet_putpeer(peer);
1305         else
1306                 rt->rt_peer_genid = rt_peer_genid();
1307 }
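/*
 * At most one peer is ever attached: the cmpxchg() installs it only if
 * rt->peer is still NULL; if another CPU won the race, the peer we just
 * looked up is released again via inet_putpeer().
 */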
1308
1309 /*
1310  * Peer allocation may fail only in serious out-of-memory conditions.  However
1311  * we can still generate some output.
1312  * Random ID selection looks a bit dangerous because we have no chance of
1313  * selecting an ID that is unique within a reasonable period of time.
1314  * But a broken packet identifier may be better than no packet at all.
1315  */
1316 static void ip_select_fb_ident(struct iphdr *iph)
1317 {
1318         static DEFINE_SPINLOCK(ip_fb_id_lock);
1319         static u32 ip_fallback_id;
1320         u32 salt;
1321
1322         spin_lock_bh(&ip_fb_id_lock);
1323         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1324         iph->id = htons(salt & 0xFFFF);
1325         ip_fallback_id = salt;
1326         spin_unlock_bh(&ip_fb_id_lock);
1327 }
1328
1329 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1330 {
1331         struct rtable *rt = (struct rtable *) dst;
1332
1333         if (rt) {
1334                 if (rt->peer == NULL)
1335                         rt_bind_peer(rt, rt->rt_dst, 1);
1336
1337                 /* If peer is attached to destination, it is never detached,
1338                    so we need not grab a lock to dereference it.
1339                  */
1340                 if (rt->peer) {
1341                         iph->id = htons(inet_getid(rt->peer, more));
1342                         return;
1343                 }
1344         } else
1345                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1346                        __builtin_return_address(0));
1347
1348         ip_select_fb_ident(iph);
1349 }
1350 EXPORT_SYMBOL(__ip_select_ident);
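/*
 * IP datagram IDs normally come from the destination's inet_peer counter
 * via inet_getid(); ip_select_fb_ident() is only the fallback for the
 * rare case where no peer could be bound.
 */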
1351
1352 static void rt_del(unsigned hash, struct rtable *rt)
1353 {
1354         struct rtable __rcu **rthp;
1355         struct rtable *aux;
1356
1357         rthp = &rt_hash_table[hash].chain;
1358         spin_lock_bh(rt_hash_lock_addr(hash));
1359         ip_rt_put(rt);
1360         while ((aux = rcu_dereference_protected(*rthp,
1361                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1362                 if (aux == rt || rt_is_expired(aux)) {
1363                         *rthp = aux->dst.rt_next;
1364                         rt_free(aux);
1365                         continue;
1366                 }
1367                 rthp = &aux->dst.rt_next;
1368         }
1369         spin_unlock_bh(rt_hash_lock_addr(hash));
1370 }
1371
1372 /* called in rcu_read_lock() section */
1373 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1374                     __be32 saddr, struct net_device *dev)
1375 {
1376         struct in_device *in_dev = __in_dev_get_rcu(dev);
1377         struct inet_peer *peer;
1378         struct net *net;
1379
1380         if (!in_dev)
1381                 return;
1382
1383         net = dev_net(dev);
1384         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1385             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1386             ipv4_is_zeronet(new_gw))
1387                 goto reject_redirect;
1388
1389         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1390                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1391                         goto reject_redirect;
1392                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1393                         goto reject_redirect;
1394         } else {
1395                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1396                         goto reject_redirect;
1397         }
1398
1399         peer = inet_getpeer_v4(daddr, 1);
1400         if (peer) {
1401                 peer->redirect_learned.a4 = new_gw;
1402
1403                 inet_putpeer(peer);
1404
1405                 atomic_inc(&__rt_peer_genid);
1406         }
1407         return;
1408
1409 reject_redirect:
1410 #ifdef CONFIG_IP_ROUTE_VERBOSE
1411         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1412                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1413                         "  Advised path = %pI4 -> %pI4\n",
1414                        &old_gw, dev->name, &new_gw,
1415                        &saddr, &daddr);
1416 #endif
1417         ;
1418 }
1419
1420 static bool peer_pmtu_expired(struct inet_peer *peer)
1421 {
1422         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1423
1424         return orig &&
1425                time_after_eq(jiffies, orig) &&
1426                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1427 }
1428
1429 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1430 {
1431         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1432
1433         return orig &&
1434                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1435 }
1436
1437 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1438 {
1439         struct rtable *rt = (struct rtable *)dst;
1440         struct dst_entry *ret = dst;
1441
1442         if (rt) {
1443                 if (dst->obsolete > 0) {
1444                         ip_rt_put(rt);
1445                         ret = NULL;
1446                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1447                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1448                                                 rt->rt_oif,
1449                                                 rt_genid(dev_net(dst->dev)));
1450                         rt_del(hash, rt);
1451                         ret = NULL;
1452                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1453                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1454                 }
1455         }
1456         return ret;
1457 }
1458
1459 /*
1460  * Algorithm:
1461  *      1. The first ip_rt_redirect_number redirects are sent
1462  *         with exponential backoff, then we stop sending them at all,
1463  *         assuming that the host ignores our redirects.
1464  *      2. If we did not see packets requiring redirects
1465  *         during ip_rt_redirect_silence, we assume that the host
1466  *         forgot the redirected route and we start sending redirects again.
1467  *
1468  * This algorithm is much cheaper and more intelligent than dumb load limiting
1469  * in icmp.c.
1470  *
1471  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1472  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1473  */
1474
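/*
 * A minimal sketch of the backoff deadline used below (the helper name is
 * hypothetical and not part of this file): the k-th redirect towards a peer
 * is only eligible once ip_rt_redirect_load << k jiffies have passed since
 * the previous one, so the gaps grow as load, 2*load, 4*load, ...
 */
static inline unsigned long redirect_backoff_deadline(unsigned long rate_last,
						       u32 rate_tokens)
{
	return rate_last + (ip_rt_redirect_load << rate_tokens);
}
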
1475 void ip_rt_send_redirect(struct sk_buff *skb)
1476 {
1477         struct rtable *rt = skb_rtable(skb);
1478         struct in_device *in_dev;
1479         struct inet_peer *peer;
1480         int log_martians;
1481
1482         rcu_read_lock();
1483         in_dev = __in_dev_get_rcu(rt->dst.dev);
1484         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1485                 rcu_read_unlock();
1486                 return;
1487         }
1488         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1489         rcu_read_unlock();
1490
1491         if (!rt->peer)
1492                 rt_bind_peer(rt, rt->rt_dst, 1);
1493         peer = rt->peer;
1494         if (!peer) {
1495                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1496                 return;
1497         }
1498
1499         /* No redirected packets during ip_rt_redirect_silence;
1500          * reset the algorithm.
1501          */
1502         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1503                 peer->rate_tokens = 0;
1504
1505         /* Too many ignored redirects; do not send anything and
1506          * record the last seen redirected packet in peer->rate_last.
1507          */
1508         if (peer->rate_tokens >= ip_rt_redirect_number) {
1509                 peer->rate_last = jiffies;
1510                 return;
1511         }
1512
1513         /* Check for load limit; set rate_last to the latest sent
1514          * redirect.
1515          */
1516         if (peer->rate_tokens == 0 ||
1517             time_after(jiffies,
1518                        (peer->rate_last +
1519                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1520                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1521                 peer->rate_last = jiffies;
1522                 ++peer->rate_tokens;
1523 #ifdef CONFIG_IP_ROUTE_VERBOSE
1524                 if (log_martians &&
1525                     peer->rate_tokens == ip_rt_redirect_number &&
1526                     net_ratelimit())
1527                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1528                                &ip_hdr(skb)->saddr, rt->rt_iif,
1529                                 &rt->rt_dst, &rt->rt_gateway);
1530 #endif
1531         }
1532 }
1533
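/*
 * ip_error() rate-limits the ICMP destination-unreachable replies with a
 * simple token bucket kept on the inet_peer: tokens accrue at one per jiffy
 * since rate_last, are capped at ip_rt_error_burst, and each ICMP sent
 * costs ip_rt_error_cost tokens.
 */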
1534 static int ip_error(struct sk_buff *skb)
1535 {
1536         struct rtable *rt = skb_rtable(skb);
1537         struct inet_peer *peer;
1538         unsigned long now;
1539         bool send;
1540         int code;
1541
1542         switch (rt->dst.error) {
1543                 case EINVAL:
1544                 default:
1545                         goto out;
1546                 case EHOSTUNREACH:
1547                         code = ICMP_HOST_UNREACH;
1548                         break;
1549                 case ENETUNREACH:
1550                         code = ICMP_NET_UNREACH;
1551                         IP_INC_STATS_BH(dev_net(rt->dst.dev),
1552                                         IPSTATS_MIB_INNOROUTES);
1553                         break;
1554                 case EACCES:
1555                         code = ICMP_PKT_FILTERED;
1556                         break;
1557         }
1558
1559         if (!rt->peer)
1560                 rt_bind_peer(rt, rt->rt_dst, 1);
1561         peer = rt->peer;
1562
1563         send = true;
1564         if (peer) {
1565                 now = jiffies;
1566                 peer->rate_tokens += now - peer->rate_last;
1567                 if (peer->rate_tokens > ip_rt_error_burst)
1568                         peer->rate_tokens = ip_rt_error_burst;
1569                 peer->rate_last = now;
1570                 if (peer->rate_tokens >= ip_rt_error_cost)
1571                         peer->rate_tokens -= ip_rt_error_cost;
1572                 else
1573                         send = false;
1574         }
1575         if (send)
1576                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1577
1578 out:    kfree_skb(skb);
1579         return 0;
1580 }
1581
1582 /*
1583  *      The last two values are not from the RFC but
1584  *      are needed for AMPRnet AX.25 paths.
1585  */
1586
1587 static const unsigned short mtu_plateau[] =
1588 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1589
1590 static inline unsigned short guess_mtu(unsigned short old_mtu)
1591 {
1592         int i;
1593
1594         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1595                 if (old_mtu > mtu_plateau[i])
1596                         return mtu_plateau[i];
1597         return 68;
1598 }
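
/*
 * Example: with the plateau table above, guess_mtu(1500) returns 1492 and
 * guess_mtu(296) returns 216; anything of 128 or less falls through to the
 * 68 byte minimum.
 */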
1599
1600 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1601                                  unsigned short new_mtu,
1602                                  struct net_device *dev)
1603 {
1604         unsigned short old_mtu = ntohs(iph->tot_len);
1605         unsigned short est_mtu = 0;
1606         struct inet_peer *peer;
1607
1608         peer = inet_getpeer_v4(iph->daddr, 1);
1609         if (peer) {
1610                 unsigned short mtu = new_mtu;
1611
1612                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1613                         /* BSD 4.2 derived systems incorrectly adjust
1614                          * tot_len by the IP header length, and report
1615                          * a zero MTU in the ICMP message.
1616                          */
1617                         if (mtu == 0 &&
1618                             old_mtu >= 68 + (iph->ihl << 2))
1619                                 old_mtu -= iph->ihl << 2;
1620                         mtu = guess_mtu(old_mtu);
1621                 }
1622
1623                 if (mtu < ip_rt_min_pmtu)
1624                         mtu = ip_rt_min_pmtu;
1625                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1626                         unsigned long pmtu_expires;
1627
1628                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1629                         if (!pmtu_expires)
1630                                 pmtu_expires = 1UL;
1631
1632                         est_mtu = mtu;
1633                         peer->pmtu_learned = mtu;
1634                         peer->pmtu_expires = pmtu_expires;
1635                 }
1636
1637                 inet_putpeer(peer);
1638
1639                 atomic_inc(&__rt_peer_genid);
1640         }
1641         return est_mtu ? : new_mtu;
1642 }
1643
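/*
 * A PMTU learned above lives only on the inet_peer; check_peer_pmtu() is
 * what later copies it into a route's RTAX_MTU metric (and restores
 * pmtu_orig once the learned value expires), called from ipv4_dst_check()
 * and ip_rt_update_pmtu().
 */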
1644 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1645 {
1646         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1647
1648         if (!expires)
1649                 return;
1650         if (time_before(jiffies, expires)) {
1651                 u32 orig_dst_mtu = dst_mtu(dst);
1652                 if (peer->pmtu_learned < orig_dst_mtu) {
1653                         if (!peer->pmtu_orig)
1654                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1655                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1656                 }
1657         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1658                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1659 }
1660
1661 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1662 {
1663         struct rtable *rt = (struct rtable *) dst;
1664         struct inet_peer *peer;
1665
1666         dst_confirm(dst);
1667
1668         if (!rt->peer)
1669                 rt_bind_peer(rt, rt->rt_dst, 1);
1670         peer = rt->peer;
1671         if (peer) {
1672                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1673
1674                 if (mtu < ip_rt_min_pmtu)
1675                         mtu = ip_rt_min_pmtu;
1676                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1677
1678                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1679                         if (!pmtu_expires)
1680                                 pmtu_expires = 1UL;
1681
1682                         peer->pmtu_learned = mtu;
1683                         peer->pmtu_expires = pmtu_expires;
1684
1685                         atomic_inc(&__rt_peer_genid);
1686                         rt->rt_peer_genid = rt_peer_genid();
1687                 }
1688                 check_peer_pmtu(dst, peer);
1689         }
1690 }
1691
1692 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1693 {
1694         struct rtable *rt = (struct rtable *) dst;
1695         __be32 orig_gw = rt->rt_gateway;
1696         struct neighbour *n, *old_n;
1697
1698         dst_confirm(&rt->dst);
1699
1700         rt->rt_gateway = peer->redirect_learned.a4;
1701         n = __arp_bind_neighbour(&rt->dst, rt->rt_gateway);
1702         if (IS_ERR(n))
1703                 return PTR_ERR(n);
1704         old_n = xchg(&rt->dst._neighbour, n);
1705         if (old_n)
1706                 neigh_release(old_n);
1707         if (!n || !(n->nud_state & NUD_VALID)) {
1708                 if (n)
1709                         neigh_event_send(n, NULL);
1710                 rt->rt_gateway = orig_gw;
1711                 return -EAGAIN;
1712         } else {
1713                 rt->rt_flags |= RTCF_REDIRECTED;
1714                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1715         }
1716         return 0;
1717 }
1718
1719 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1720 {
1721         struct rtable *rt = (struct rtable *) dst;
1722
1723         if (rt_is_expired(rt))
1724                 return NULL;
1725         if (rt->rt_peer_genid != rt_peer_genid()) {
1726                 struct inet_peer *peer;
1727
1728                 if (!rt->peer)
1729                         rt_bind_peer(rt, rt->rt_dst, 0);
1730
1731                 peer = rt->peer;
1732                 if (peer) {
1733                         check_peer_pmtu(dst, peer);
1734
1735                         if (peer->redirect_learned.a4 &&
1736                             peer->redirect_learned.a4 != rt->rt_gateway) {
1737                                 if (check_peer_redir(dst, peer))
1738                                         return NULL;
1739                         }
1740                 }
1741
1742                 rt->rt_peer_genid = rt_peer_genid();
1743         }
1744         return dst;
1745 }
1746
1747 static void ipv4_dst_destroy(struct dst_entry *dst)
1748 {
1749         struct rtable *rt = (struct rtable *) dst;
1750         struct inet_peer *peer = rt->peer;
1751
1752         if (rt->fi) {
1753                 fib_info_put(rt->fi);
1754                 rt->fi = NULL;
1755         }
1756         if (peer) {
1757                 rt->peer = NULL;
1758                 inet_putpeer(peer);
1759         }
1760 }
1761
1762
1763 static void ipv4_link_failure(struct sk_buff *skb)
1764 {
1765         struct rtable *rt;
1766
1767         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1768
1769         rt = skb_rtable(skb);
1770         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1771                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1772 }
1773
1774 static int ip_rt_bug(struct sk_buff *skb)
1775 {
1776         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1777                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1778                 skb->dev ? skb->dev->name : "?");
1779         kfree_skb(skb);
1780         WARN_ON(1);
1781         return 0;
1782 }
1783
1784 /*
1785    We do not cache the source address of the outgoing interface,
1786    because it is used only by the IP RR, TS and SRR options,
1787    so it is out of the fast path.
1788
1789    BTW remember: "addr" is allowed to be unaligned
1790    in IP options!
1791  */
1792
1793 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1794 {
1795         __be32 src;
1796
1797         if (rt_is_output_route(rt))
1798                 src = ip_hdr(skb)->saddr;
1799         else {
1800                 struct fib_result res;
1801                 struct flowi4 fl4;
1802                 struct iphdr *iph;
1803
1804                 iph = ip_hdr(skb);
1805
1806                 memset(&fl4, 0, sizeof(fl4));
1807                 fl4.daddr = iph->daddr;
1808                 fl4.saddr = iph->saddr;
1809                 fl4.flowi4_tos = RT_TOS(iph->tos);
1810                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1811                 fl4.flowi4_iif = skb->dev->ifindex;
1812                 fl4.flowi4_mark = skb->mark;
1813
1814                 rcu_read_lock();
1815                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1816                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1817                 else
1818                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1819                                         RT_SCOPE_UNIVERSE);
1820                 rcu_read_unlock();
1821         }
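        /* memcpy, not a direct 32-bit store: "addr" points into the IP
         * options area and may not be 4-byte aligned (see the comment above).
         */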
1822         memcpy(addr, &src, 4);
1823 }
1824
1825 #ifdef CONFIG_IP_ROUTE_CLASSID
1826 static void set_class_tag(struct rtable *rt, u32 tag)
1827 {
1828         if (!(rt->dst.tclassid & 0xFFFF))
1829                 rt->dst.tclassid |= tag & 0xFFFF;
1830         if (!(rt->dst.tclassid & 0xFFFF0000))
1831                 rt->dst.tclassid |= tag & 0xFFFF0000;
1832 }
1833 #endif
1834
1835 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1836 {
1837         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1838
1839         if (advmss == 0) {
1840                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1841                                ip_rt_min_advmss);
1842                 if (advmss > 65535 - 40)
1843                         advmss = 65535 - 40;
1844         }
1845         return advmss;
1846 }
1847
1848 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1849 {
1850         unsigned int mtu = dst->dev->mtu;
1851
1852         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1853                 const struct rtable *rt = (const struct rtable *) dst;
1854
1855                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1856                         mtu = 576;
1857         }
1858
1859         if (mtu > IP_MAX_MTU)
1860                 mtu = IP_MAX_MTU;
1861
1862         return mtu;
1863 }
1864
1865 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1866                             struct fib_info *fi)
1867 {
1868         struct inet_peer *peer;
1869         int create = 0;
1870
1871         /* If a peer entry exists for this destination, we must hook
1872          * it up in order to get at cached metrics.
1873          */
1874         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1875                 create = 1;
1876
1877         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1878         if (peer) {
1879                 rt->rt_peer_genid = rt_peer_genid();
1880                 if (inet_metrics_new(peer))
1881                         memcpy(peer->metrics, fi->fib_metrics,
1882                                sizeof(u32) * RTAX_MAX);
1883                 dst_init_metrics(&rt->dst, peer->metrics, false);
1884
1885                 check_peer_pmtu(&rt->dst, peer);
1886                 if (peer->redirect_learned.a4 &&
1887                     peer->redirect_learned.a4 != rt->rt_gateway) {
1888                         rt->rt_gateway = peer->redirect_learned.a4;
1889                         rt->rt_flags |= RTCF_REDIRECTED;
1890                 }
1891         } else {
1892                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1893                         rt->fi = fi;
1894                         atomic_inc(&fi->fib_clntref);
1895                 }
1896                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1897         }
1898 }
1899
1900 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1901                            const struct fib_result *res,
1902                            struct fib_info *fi, u16 type, u32 itag)
1903 {
1904         struct dst_entry *dst = &rt->dst;
1905
1906         if (fi) {
1907                 if (FIB_RES_GW(*res) &&
1908                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1909                         rt->rt_gateway = FIB_RES_GW(*res);
1910                 rt_init_metrics(rt, fl4, fi);
1911 #ifdef CONFIG_IP_ROUTE_CLASSID
1912                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1913 #endif
1914         }
1915
1916         if (dst_mtu(dst) > IP_MAX_MTU)
1917                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1918         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1919                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1920
1921 #ifdef CONFIG_IP_ROUTE_CLASSID
1922 #ifdef CONFIG_IP_MULTIPLE_TABLES
1923         set_class_tag(rt, fib_rules_tclass(res));
1924 #endif
1925         set_class_tag(rt, itag);
1926 #endif
1927 }
1928
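/*
 * All cache entries are allocated as host routes (DST_HOST); the caller
 * passes the per-device NOPOLICY/NOXFRM sysctl state so the corresponding
 * IPsec policy/transform checks can be skipped for this route.
 */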
1929 static struct rtable *rt_dst_alloc(struct net_device *dev,
1930                                    bool nopolicy, bool noxfrm)
1931 {
1932         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1933                          DST_HOST |
1934                          (nopolicy ? DST_NOPOLICY : 0) |
1935                          (noxfrm ? DST_NOXFRM : 0));
1936 }
1937
1938 /* called in rcu_read_lock() section */
1939 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1940                                 u8 tos, struct net_device *dev, int our)
1941 {
1942         unsigned int hash;
1943         struct rtable *rth;
1944         __be32 spec_dst;
1945         struct in_device *in_dev = __in_dev_get_rcu(dev);
1946         u32 itag = 0;
1947         int err;
1948
1949         /* Primary sanity checks. */
1950
1951         if (in_dev == NULL)
1952                 return -EINVAL;
1953
1954         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1955             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1956                 goto e_inval;
1957
1958         if (ipv4_is_zeronet(saddr)) {
1959                 if (!ipv4_is_local_multicast(daddr))
1960                         goto e_inval;
1961                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1962         } else {
1963                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1964                                           &itag);
1965                 if (err < 0)
1966                         goto e_err;
1967         }
1968         rth = rt_dst_alloc(init_net.loopback_dev,
1969                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1970         if (!rth)
1971                 goto e_nobufs;
1972
1973 #ifdef CONFIG_IP_ROUTE_CLASSID
1974         rth->dst.tclassid = itag;
1975 #endif
1976         rth->dst.output = ip_rt_bug;
1977
1978         rth->rt_key_dst = daddr;
1979         rth->rt_key_src = saddr;
1980         rth->rt_genid   = rt_genid(dev_net(dev));
1981         rth->rt_flags   = RTCF_MULTICAST;
1982         rth->rt_type    = RTN_MULTICAST;
1983         rth->rt_key_tos = tos;
1984         rth->rt_dst     = daddr;
1985         rth->rt_src     = saddr;
1986         rth->rt_route_iif = dev->ifindex;
1987         rth->rt_iif     = dev->ifindex;
1988         rth->rt_oif     = 0;
1989         rth->rt_mark    = skb->mark;
1990         rth->rt_gateway = daddr;
1991         rth->rt_spec_dst= spec_dst;
1992         rth->rt_peer_genid = 0;
1993         rth->peer = NULL;
1994         rth->fi = NULL;
1995         if (our) {
1996                 rth->dst.input= ip_local_deliver;
1997                 rth->rt_flags |= RTCF_LOCAL;
1998         }
1999
2000 #ifdef CONFIG_IP_MROUTE
2001         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2002                 rth->dst.input = ip_mr_input;
2003 #endif
2004         RT_CACHE_STAT_INC(in_slow_mc);
2005
2006         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2007         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2008         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2009
2010 e_nobufs:
2011         return -ENOBUFS;
2012 e_inval:
2013         return -EINVAL;
2014 e_err:
2015         return err;
2016 }
2017
2018
2019 static void ip_handle_martian_source(struct net_device *dev,
2020                                      struct in_device *in_dev,
2021                                      struct sk_buff *skb,
2022                                      __be32 daddr,
2023                                      __be32 saddr)
2024 {
2025         RT_CACHE_STAT_INC(in_martian_src);
2026 #ifdef CONFIG_IP_ROUTE_VERBOSE
2027         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2028                 /*
2029                  *      RFC1812 recommendation, if source is martian,
2030                  *      the only hint is MAC header.
2031                  */
2032                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2033                         &daddr, &saddr, dev->name);
2034                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2035                         int i;
2036                         const unsigned char *p = skb_mac_header(skb);
2037                         printk(KERN_WARNING "ll header: ");
2038                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2039                                 printk("%02x", *p);
2040                                 if (i < (dev->hard_header_len - 1))
2041                                         printk(":");
2042                         }
2043                         printk("\n");
2044                 }
2045         }
2046 #endif
2047 }
2048
2049 /* called in rcu_read_lock() section */
2050 static int __mkroute_input(struct sk_buff *skb,
2051                            const struct fib_result *res,
2052                            struct in_device *in_dev,
2053                            __be32 daddr, __be32 saddr, u32 tos,
2054                            struct rtable **result)
2055 {
2056         struct rtable *rth;
2057         int err;
2058         struct in_device *out_dev;
2059         unsigned int flags = 0;
2060         __be32 spec_dst;
2061         u32 itag;
2062
2063         /* get a working reference to the output device */
2064         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2065         if (out_dev == NULL) {
2066                 if (net_ratelimit())
2067                         printk(KERN_CRIT "Bug in ip_route_input" \
2068                                "_slow(). Please, report\n");
2069                 return -EINVAL;
2070         }
2071
2072
2073         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2074                                   in_dev->dev, &spec_dst, &itag);
2075         if (err < 0) {
2076                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2077                                          saddr);
2078
2079                 goto cleanup;
2080         }
2081
2082         if (err)
2083                 flags |= RTCF_DIRECTSRC;
2084
2085         if (out_dev == in_dev && err &&
2086             (IN_DEV_SHARED_MEDIA(out_dev) ||
2087              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2088                 flags |= RTCF_DOREDIRECT;
2089
2090         if (skb->protocol != htons(ETH_P_IP)) {
2091                 /* Not IP (i.e. ARP). Do not create a route if it is
2092                  * invalid for proxy arp. DNAT routes are always valid.
2093                  *
2094                  * The proxy arp feature has been extended to allow ARP
2095                  * replies back on the same interface, to support
2096                  * private VLAN switch technologies. See arp.c.
2097                  */
2098                 if (out_dev == in_dev &&
2099                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2100                         err = -EINVAL;
2101                         goto cleanup;
2102                 }
2103         }
2104
2105         rth = rt_dst_alloc(out_dev->dev,
2106                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2107                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2108         if (!rth) {
2109                 err = -ENOBUFS;
2110                 goto cleanup;
2111         }
2112
2113         rth->rt_key_dst = daddr;
2114         rth->rt_key_src = saddr;
2115         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2116         rth->rt_flags = flags;
2117         rth->rt_type = res->type;
2118         rth->rt_key_tos = tos;
2119         rth->rt_dst     = daddr;
2120         rth->rt_src     = saddr;
2121         rth->rt_route_iif = in_dev->dev->ifindex;
2122         rth->rt_iif     = in_dev->dev->ifindex;
2123         rth->rt_oif     = 0;
2124         rth->rt_mark    = skb->mark;
2125         rth->rt_gateway = daddr;
2126         rth->rt_spec_dst= spec_dst;
2127         rth->rt_peer_genid = 0;
2128         rth->peer = NULL;
2129         rth->fi = NULL;
2130
2131         rth->dst.input = ip_forward;
2132         rth->dst.output = ip_output;
2133
2134         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2135
2136         *result = rth;
2137         err = 0;
2138  cleanup:
2139         return err;
2140 }
2141
2142 static int ip_mkroute_input(struct sk_buff *skb,
2143                             struct fib_result *res,
2144                             const struct flowi4 *fl4,
2145                             struct in_device *in_dev,
2146                             __be32 daddr, __be32 saddr, u32 tos)
2147 {
2148         struct rtable* rth = NULL;
2149         int err;
2150         unsigned hash;
2151
2152 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2153         if (res->fi && res->fi->fib_nhs > 1)
2154                 fib_select_multipath(res);
2155 #endif
2156
2157         /* create a routing cache entry */
2158         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2159         if (err)
2160                 return err;
2161
2162         /* put it into the cache */
2163         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2164                        rt_genid(dev_net(rth->dst.dev)));
2165         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2166         if (IS_ERR(rth))
2167                 return PTR_ERR(rth);
2168         return 0;
2169 }
2170
2171 /*
2172  *      NOTE. We drop all packets that have local source
2173  *      addresses, because every properly looped back packet
2174  *      must already have the correct destination attached by the output routine.
2175  *
2176  *      This approach solves two big problems:
2177  *      1. Non-simplex devices are handled properly.
2178  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2179  *      Called with rcu_read_lock().
2180  */
2181
2182 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2183                                u8 tos, struct net_device *dev)
2184 {
2185         struct fib_result res;
2186         struct in_device *in_dev = __in_dev_get_rcu(dev);
2187         struct flowi4   fl4;
2188         unsigned        flags = 0;
2189         u32             itag = 0;
2190         struct rtable * rth;
2191         unsigned        hash;
2192         __be32          spec_dst;
2193         int             err = -EINVAL;
2194         struct net    * net = dev_net(dev);
2195
2196         /* IP on this device is disabled. */
2197
2198         if (!in_dev)
2199                 goto out;
2200
2201         /* Check for the most weird martians, which cannot be detected
2202            by fib_lookup.
2203          */
2204
2205         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2206             ipv4_is_loopback(saddr))
2207                 goto martian_source;
2208
2209         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2210                 goto brd_input;
2211
2212         /* Accept zero addresses only to limited broadcast;
2213          * I do not even know whether to fix it or not. Waiting for complaints :-)
2214          */
2215         if (ipv4_is_zeronet(saddr))
2216                 goto martian_source;
2217
2218         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2219                 goto martian_destination;
2220
2221         /*
2222          *      Now we are ready to route packet.
2223          */
2224         fl4.flowi4_oif = 0;
2225         fl4.flowi4_iif = dev->ifindex;
2226         fl4.flowi4_mark = skb->mark;
2227         fl4.flowi4_tos = tos;
2228         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2229         fl4.daddr = daddr;
2230         fl4.saddr = saddr;
2231         err = fib_lookup(net, &fl4, &res);
2232         if (err != 0) {
2233                 if (!IN_DEV_FORWARD(in_dev))
2234                         goto e_hostunreach;
2235                 goto no_route;
2236         }
2237
2238         RT_CACHE_STAT_INC(in_slow_tot);
2239
2240         if (res.type == RTN_BROADCAST)
2241                 goto brd_input;
2242
2243         if (res.type == RTN_LOCAL) {
2244                 err = fib_validate_source(skb, saddr, daddr, tos,
2245                                           net->loopback_dev->ifindex,
2246                                           dev, &spec_dst, &itag);
2247                 if (err < 0)
2248                         goto martian_source_keep_err;
2249                 if (err)
2250                         flags |= RTCF_DIRECTSRC;
2251                 spec_dst = daddr;
2252                 goto local_input;
2253         }
2254
2255         if (!IN_DEV_FORWARD(in_dev))
2256                 goto e_hostunreach;
2257         if (res.type != RTN_UNICAST)
2258                 goto martian_destination;
2259
2260         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2261 out:    return err;
2262
2263 brd_input:
2264         if (skb->protocol != htons(ETH_P_IP))
2265                 goto e_inval;
2266
2267         if (ipv4_is_zeronet(saddr))
2268                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2269         else {
2270                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2271                                           &itag);
2272                 if (err < 0)
2273                         goto martian_source_keep_err;
2274                 if (err)
2275                         flags |= RTCF_DIRECTSRC;
2276         }
2277         flags |= RTCF_BROADCAST;
2278         res.type = RTN_BROADCAST;
2279         RT_CACHE_STAT_INC(in_brd);
2280
2281 local_input:
2282         rth = rt_dst_alloc(net->loopback_dev,
2283                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2284         if (!rth)
2285                 goto e_nobufs;
2286
2287         rth->dst.input= ip_local_deliver;
2288         rth->dst.output= ip_rt_bug;
2289 #ifdef CONFIG_IP_ROUTE_CLASSID
2290         rth->dst.tclassid = itag;
2291 #endif
2292
2293         rth->rt_key_dst = daddr;
2294         rth->rt_key_src = saddr;
2295         rth->rt_genid = rt_genid(net);
2296         rth->rt_flags   = flags|RTCF_LOCAL;
2297         rth->rt_type    = res.type;
2298         rth->rt_key_tos = tos;
2299         rth->rt_dst     = daddr;
2300         rth->rt_src     = saddr;
2301 #ifdef CONFIG_IP_ROUTE_CLASSID
2302         rth->dst.tclassid = itag;
2303 #endif
2304         rth->rt_route_iif = dev->ifindex;
2305         rth->rt_iif     = dev->ifindex;
2306         rth->rt_oif     = 0;
2307         rth->rt_mark    = skb->mark;
2308         rth->rt_gateway = daddr;
2309         rth->rt_spec_dst= spec_dst;
2310         rth->rt_peer_genid = 0;
2311         rth->peer = NULL;
2312         rth->fi = NULL;
2313         if (res.type == RTN_UNREACHABLE) {
2314                 rth->dst.input= ip_error;
2315                 rth->dst.error= -err;
2316                 rth->rt_flags   &= ~RTCF_LOCAL;
2317         }
2318         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2319         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2320         err = 0;
2321         if (IS_ERR(rth))
2322                 err = PTR_ERR(rth);
2323         goto out;
2324
2325 no_route:
2326         RT_CACHE_STAT_INC(in_no_route);
2327         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2328         res.type = RTN_UNREACHABLE;
2329         if (err == -ESRCH)
2330                 err = -ENETUNREACH;
2331         goto local_input;
2332
2333         /*
2334          *      Do not cache martian addresses: they should be logged (RFC1812)
2335          */
2336 martian_destination:
2337         RT_CACHE_STAT_INC(in_martian_dst);
2338 #ifdef CONFIG_IP_ROUTE_VERBOSE
2339         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2340                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2341                         &daddr, &saddr, dev->name);
2342 #endif
2343
2344 e_hostunreach:
2345         err = -EHOSTUNREACH;
2346         goto out;
2347
2348 e_inval:
2349         err = -EINVAL;
2350         goto out;
2351
2352 e_nobufs:
2353         err = -ENOBUFS;
2354         goto out;
2355
2356 martian_source:
2357         err = -EINVAL;
2358 martian_source_keep_err:
2359         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2360         goto out;
2361 }
2362
2363 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2364                            u8 tos, struct net_device *dev, bool noref)
2365 {
2366         struct rtable * rth;
2367         unsigned        hash;
2368         int iif = dev->ifindex;
2369         struct net *net;
2370         int res;
2371
2372         net = dev_net(dev);
2373
2374         rcu_read_lock();
2375
2376         if (!rt_caching(net))
2377                 goto skip_cache;
2378
2379         tos &= IPTOS_RT_MASK;
2380         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2381
2382         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2383              rth = rcu_dereference(rth->dst.rt_next)) {
2384                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2385                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2386                      (rth->rt_route_iif ^ iif) |
2387                      (rth->rt_key_tos ^ tos)) == 0 &&
2388                     rth->rt_mark == skb->mark &&
2389                     net_eq(dev_net(rth->dst.dev), net) &&
2390                     !rt_is_expired(rth)) {
2391                         if (noref) {
2392                                 dst_use_noref(&rth->dst, jiffies);
2393                                 skb_dst_set_noref(skb, &rth->dst);
2394                         } else {
2395                                 dst_use(&rth->dst, jiffies);
2396                                 skb_dst_set(skb, &rth->dst);
2397                         }
2398                         RT_CACHE_STAT_INC(in_hit);
2399                         rcu_read_unlock();
2400                         return 0;
2401                 }
2402                 RT_CACHE_STAT_INC(in_hlist_search);
2403         }
2404
2405 skip_cache:
2406         /* Multicast recognition logic is moved from route cache to here.
2407            The problem was that too many Ethernet cards have broken/missing
2408            hardware multicast filters :-( As a result, a host on a multicast
2409            network acquires a lot of useless route cache entries, e.g. from
2410            SDR messages from all over the world. Now we try to get rid of them.
2411            Really, provided the software IP multicast filter is organized
2412            reasonably (at least, hashed), it does not result in a slowdown
2413            compared with route cache reject entries.
2414            Note that multicast routers are not affected, because
2415            a route cache entry is created eventually.
2416          */
2417         if (ipv4_is_multicast(daddr)) {
2418                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2419
2420                 if (in_dev) {
2421                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2422                                                   ip_hdr(skb)->protocol);
2423                         if (our
2424 #ifdef CONFIG_IP_MROUTE
2425                                 ||
2426                             (!ipv4_is_local_multicast(daddr) &&
2427                              IN_DEV_MFORWARD(in_dev))
2428 #endif
2429                            ) {
2430                                 int res = ip_route_input_mc(skb, daddr, saddr,
2431                                                             tos, dev, our);
2432                                 rcu_read_unlock();
2433                                 return res;
2434                         }
2435                 }
2436                 rcu_read_unlock();
2437                 return -EINVAL;
2438         }
2439         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2440         rcu_read_unlock();
2441         return res;
2442 }
2443 EXPORT_SYMBOL(ip_route_input_common);
2444
2445 /* called with rcu_read_lock() */
2446 static struct rtable *__mkroute_output(const struct fib_result *res,
2447                                        const struct flowi4 *fl4,
2448                                        __be32 orig_daddr, __be32 orig_saddr,
2449                                        int orig_oif, struct net_device *dev_out,
2450                                        unsigned int flags)
2451 {
2452         struct fib_info *fi = res->fi;
2453         u32 tos = RT_FL_TOS(fl4);
2454         struct in_device *in_dev;
2455         u16 type = res->type;
2456         struct rtable *rth;
2457
2458         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2459                 return ERR_PTR(-EINVAL);
2460
2461         if (ipv4_is_lbcast(fl4->daddr))
2462                 type = RTN_BROADCAST;
2463         else if (ipv4_is_multicast(fl4->daddr))
2464                 type = RTN_MULTICAST;
2465         else if (ipv4_is_zeronet(fl4->daddr))
2466                 return ERR_PTR(-EINVAL);
2467
2468         if (dev_out->flags & IFF_LOOPBACK)
2469                 flags |= RTCF_LOCAL;
2470
2471         in_dev = __in_dev_get_rcu(dev_out);
2472         if (!in_dev)
2473                 return ERR_PTR(-EINVAL);
2474
2475         if (type == RTN_BROADCAST) {
2476                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2477                 fi = NULL;
2478         } else if (type == RTN_MULTICAST) {
2479                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2480                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2481                                      fl4->flowi4_proto))
2482                         flags &= ~RTCF_LOCAL;
2483                 /* If a multicast route does not exist, use the
2484                  * default one, but do not use a gateway in this case.
2485                  * Yes, it is a hack.
2486                  */
2487                 if (fi && res->prefixlen < 4)
2488                         fi = NULL;
2489         }
2490
2491         rth = rt_dst_alloc(dev_out,
2492                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2493                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2494         if (!rth)
2495                 return ERR_PTR(-ENOBUFS);
2496
2497         rth->dst.output = ip_output;
2498
2499         rth->rt_key_dst = orig_daddr;
2500         rth->rt_key_src = orig_saddr;
2501         rth->rt_genid = rt_genid(dev_net(dev_out));
2502         rth->rt_flags   = flags;
2503         rth->rt_type    = type;
2504         rth->rt_key_tos = tos;
2505         rth->rt_dst     = fl4->daddr;
2506         rth->rt_src     = fl4->saddr;
2507         rth->rt_route_iif = 0;
2508         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2509         rth->rt_oif     = orig_oif;
2510         rth->rt_mark    = fl4->flowi4_mark;
2511         rth->rt_gateway = fl4->daddr;
2512         rth->rt_spec_dst= fl4->saddr;
2513         rth->rt_peer_genid = 0;
2514         rth->peer = NULL;
2515         rth->fi = NULL;
2516
2517         RT_CACHE_STAT_INC(out_slow_tot);
2518
2519         if (flags & RTCF_LOCAL) {
2520                 rth->dst.input = ip_local_deliver;
2521                 rth->rt_spec_dst = fl4->daddr;
2522         }
2523         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2524                 rth->rt_spec_dst = fl4->saddr;
2525                 if (flags & RTCF_LOCAL &&
2526                     !(dev_out->flags & IFF_LOOPBACK)) {
2527                         rth->dst.output = ip_mc_output;
2528                         RT_CACHE_STAT_INC(out_slow_mc);
2529                 }
2530 #ifdef CONFIG_IP_MROUTE
2531                 if (type == RTN_MULTICAST) {
2532                         if (IN_DEV_MFORWARD(in_dev) &&
2533                             !ipv4_is_local_multicast(fl4->daddr)) {
2534                                 rth->dst.input = ip_mr_input;
2535                                 rth->dst.output = ip_mc_output;
2536                         }
2537                 }
2538 #endif
2539         }
2540
2541         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2542
2543         return rth;
2544 }
2545
2546 /*
2547  * Major route resolver routine.
2548  * called with rcu_read_lock();
2549  */
2550
2551 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2552 {
2553         struct net_device *dev_out = NULL;
2554         u32 tos = RT_FL_TOS(fl4);
2555         unsigned int flags = 0;
2556         struct fib_result res;
2557         struct rtable *rth;
2558         __be32 orig_daddr;
2559         __be32 orig_saddr;
2560         int orig_oif;
2561
2562         res.fi          = NULL;
2563 #ifdef CONFIG_IP_MULTIPLE_TABLES
2564         res.r           = NULL;
2565 #endif
2566
2567         orig_daddr = fl4->daddr;
2568         orig_saddr = fl4->saddr;
2569         orig_oif = fl4->flowi4_oif;
2570
2571         fl4->flowi4_iif = net->loopback_dev->ifindex;
2572         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2573         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2574                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2575
2576         rcu_read_lock();
2577         if (fl4->saddr) {
2578                 rth = ERR_PTR(-EINVAL);
2579                 if (ipv4_is_multicast(fl4->saddr) ||
2580                     ipv4_is_lbcast(fl4->saddr) ||
2581                     ipv4_is_zeronet(fl4->saddr))
2582                         goto out;
2583
2584                 /* I removed check for oif == dev_out->oif here.
2585                    It was wrong for two reasons:
2586                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2587                       is assigned to multiple interfaces.
2588                    2. Moreover, we are allowed to send packets with saddr
2589                       of another iface. --ANK
2590                  */
2591
2592                 if (fl4->flowi4_oif == 0 &&
2593                     (ipv4_is_multicast(fl4->daddr) ||
2594                      ipv4_is_lbcast(fl4->daddr))) {
2595                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2596                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2597                         if (dev_out == NULL)
2598                                 goto out;
2599
2600                         /* Special hack: the user can direct multicasts
2601                            and limited broadcast via the necessary interface
2602                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2603                            This hack is not just for fun, it allows
2604                            vic, vat and friends to work.
2605                            They bind the socket to loopback, set the ttl to zero
2606                            and expect that it will work.
2607                            From the viewpoint of the routing cache they are broken,
2608                            because we are not allowed to build a multicast path
2609                            with a loopback source addr (look, the routing cache
2610                            cannot know that the ttl is zero, so the packet
2611                            will not leave this host and the route is valid).
2612                            Luckily, this hack is a good workaround.
2613                          */
2614
2615                         fl4->flowi4_oif = dev_out->ifindex;
2616                         goto make_route;
2617                 }
2618
2619                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2620                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2621                         if (!__ip_dev_find(net, fl4->saddr, false))
2622                                 goto out;
2623                 }
2624         }
2625
2626
2627         if (fl4->flowi4_oif) {
2628                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2629                 rth = ERR_PTR(-ENODEV);
2630                 if (dev_out == NULL)
2631                         goto out;
2632
2633                 /* RACE: Check return value of inet_select_addr instead. */
2634                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2635                         rth = ERR_PTR(-ENETUNREACH);
2636                         goto out;
2637                 }
2638                 if (ipv4_is_local_multicast(fl4->daddr) ||
2639                     ipv4_is_lbcast(fl4->daddr)) {
2640                         if (!fl4->saddr)
2641                                 fl4->saddr = inet_select_addr(dev_out, 0,
2642                                                               RT_SCOPE_LINK);
2643                         goto make_route;
2644                 }
2645                 if (fl4->saddr) {
2646                         if (ipv4_is_multicast(fl4->daddr))
2647                                 fl4->saddr = inet_select_addr(dev_out, 0,
2648                                                               fl4->flowi4_scope);
2649                         else if (!fl4->daddr)
2650                                 fl4->saddr = inet_select_addr(dev_out, 0,
2651                                                               RT_SCOPE_HOST);
2652                 }
2653         }
2654
2655         if (!fl4->daddr) {
2656                 fl4->daddr = fl4->saddr;
2657                 if (!fl4->daddr)
2658                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2659                 dev_out = net->loopback_dev;
2660                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2661                 res.type = RTN_LOCAL;
2662                 flags |= RTCF_LOCAL;
2663                 goto make_route;
2664         }
2665
2666         if (fib_lookup(net, fl4, &res)) {
2667                 res.fi = NULL;
2668                 if (fl4->flowi4_oif) {
2669                         /* Apparently, the routing tables are wrong. Assume
2670                            that the destination is on-link.
2671
2672                            WHY? DW.
2673                            Because we are allowed to send to an iface
2674                            even if it has NO routes and NO assigned
2675                            addresses. When oif is specified, the routing
2676                            tables are looked up with only one purpose:
2677                            to check whether the destination is gatewayed, rather than
2678                            direct. Moreover, if MSG_DONTROUTE is set,
2679                            we send the packet, ignoring both routing tables
2680                            and ifaddr state. --ANK
2681
2682
2683                            We could make it even if oif is unknown,
2684                            likely IPv6, but we do not.
2685                          */
2686
2687                         if (fl4->saddr == 0)
2688                                 fl4->saddr = inet_select_addr(dev_out, 0,
2689                                                               RT_SCOPE_LINK);
2690                         res.type = RTN_UNICAST;
2691                         goto make_route;
2692                 }
2693                 rth = ERR_PTR(-ENETUNREACH);
2694                 goto out;
2695         }
2696
2697         if (res.type == RTN_LOCAL) {
2698                 if (!fl4->saddr) {
2699                         if (res.fi->fib_prefsrc)
2700                                 fl4->saddr = res.fi->fib_prefsrc;
2701                         else
2702                                 fl4->saddr = fl4->daddr;
2703                 }
2704                 dev_out = net->loopback_dev;
2705                 fl4->flowi4_oif = dev_out->ifindex;
2706                 res.fi = NULL;
2707                 flags |= RTCF_LOCAL;
2708                 goto make_route;
2709         }
2710
2711 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2712         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2713                 fib_select_multipath(&res);
2714         else
2715 #endif
2716         if (!res.prefixlen &&
2717             res.table->tb_num_default > 1 &&
2718             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2719                 fib_select_default(&res);
2720
2721         if (!fl4->saddr)
2722                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2723
2724         dev_out = FIB_RES_DEV(res);
2725         fl4->flowi4_oif = dev_out->ifindex;
2726
2727
2728 make_route:
2729         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2730                                dev_out, flags);
2731         if (!IS_ERR(rth)) {
2732                 unsigned int hash;
2733
2734                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2735                                rt_genid(dev_net(dev_out)));
2736                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2737         }
2738
2739 out:
2740         rcu_read_unlock();
2741         return rth;
2742 }
2743
2744 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2745 {
2746         struct rtable *rth;
2747         unsigned int hash;
2748
2749         if (!rt_caching(net))
2750                 goto slow_output;
2751
2752         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2753
2754         rcu_read_lock_bh();
2755         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2756                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2757                 if (rth->rt_key_dst == flp4->daddr &&
2758                     rth->rt_key_src == flp4->saddr &&
2759                     rt_is_output_route(rth) &&
2760                     rth->rt_oif == flp4->flowi4_oif &&
2761                     rth->rt_mark == flp4->flowi4_mark &&
2762                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2763                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2764                     net_eq(dev_net(rth->dst.dev), net) &&
2765                     !rt_is_expired(rth)) {
2766                         dst_use(&rth->dst, jiffies);
2767                         RT_CACHE_STAT_INC(out_hit);
2768                         rcu_read_unlock_bh();
2769                         if (!flp4->saddr)
2770                                 flp4->saddr = rth->rt_src;
2771                         if (!flp4->daddr)
2772                                 flp4->daddr = rth->rt_dst;
2773                         return rth;
2774                 }
2775                 RT_CACHE_STAT_INC(out_hlist_search);
2776         }
2777         rcu_read_unlock_bh();
2778
2779 slow_output:
2780         return ip_route_output_slow(net, flp4);
2781 }
2782 EXPORT_SYMBOL_GPL(__ip_route_output_key);
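
/*
 * Minimal usage sketch (illustrative only, error handling trimmed; the
 * surrounding variables daddr, tos, oif and net are assumed):
 *
 *	struct flowi4 fl4 = {
 *		.daddr = daddr,
 *		.flowi4_tos = RT_TOS(tos),
 *		.flowi4_oif = oif,
 *	};
 *	struct rtable *rt = __ip_route_output_key(net, &fl4);
 *
 *	if (!IS_ERR(rt)) {
 *		... use rt->dst ...
 *		ip_rt_put(rt);
 *	}
 */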
2783
2784 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2785 {
2786         return NULL;
2787 }
2788
2789 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2790 {
2791         return 0;
2792 }
2793
2794 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2795 {
2796 }
2797
2798 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2799                                           unsigned long old)
2800 {
2801         return NULL;
2802 }
2803
2804 static struct dst_ops ipv4_dst_blackhole_ops = {
2805         .family                 =       AF_INET,
2806         .protocol               =       cpu_to_be16(ETH_P_IP),
2807         .destroy                =       ipv4_dst_destroy,
2808         .check                  =       ipv4_blackhole_dst_check,
2809         .default_mtu            =       ipv4_blackhole_default_mtu,
2810         .default_advmss         =       ipv4_default_advmss,
2811         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2812         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2813 };
2814
2815 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2816 {
2817         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2818         struct rtable *ort = (struct rtable *) dst_orig;
2819
2820         if (rt) {
2821                 struct dst_entry *new = &rt->dst;
2822
2823                 new->__use = 1;
2824                 new->input = dst_discard;
2825                 new->output = dst_discard;
2826                 dst_copy_metrics(new, &ort->dst);
2827
2828                 new->dev = ort->dst.dev;
2829                 if (new->dev)
2830                         dev_hold(new->dev);
2831
2832                 rt->rt_key_dst = ort->rt_key_dst;
2833                 rt->rt_key_src = ort->rt_key_src;
2834                 rt->rt_key_tos = ort->rt_key_tos;
2835                 rt->rt_route_iif = ort->rt_route_iif;
2836                 rt->rt_iif = ort->rt_iif;
2837                 rt->rt_oif = ort->rt_oif;
2838                 rt->rt_mark = ort->rt_mark;
2839
2840                 rt->rt_genid = rt_genid(net);
2841                 rt->rt_flags = ort->rt_flags;
2842                 rt->rt_type = ort->rt_type;
2843                 rt->rt_dst = ort->rt_dst;
2844                 rt->rt_src = ort->rt_src;
2845                 rt->rt_gateway = ort->rt_gateway;
2846                 rt->rt_spec_dst = ort->rt_spec_dst;
2847                 rt->peer = ort->peer;
2848                 if (rt->peer)
2849                         atomic_inc(&rt->peer->refcnt);
2850                 rt->fi = ort->fi;
2851                 if (rt->fi)
2852                         atomic_inc(&rt->fi->fib_clntref);
2853
2854                 dst_free(new);
2855         }
2856
2857         dst_release(dst_orig);
2858
2859         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2860 }
2861
2862 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2863                                     struct sock *sk)
2864 {
2865         struct rtable *rt = __ip_route_output_key(net, flp4);
2866
2867         if (IS_ERR(rt))
2868                 return rt;
2869
2870         if (flp4->flowi4_proto)
2871                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2872                                                    flowi4_to_flowi(flp4),
2873                                                    sk, 0);
2874
2875         return rt;
2876 }
2877 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2878
2879 static int rt_fill_info(struct net *net,
2880                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2881                         int nowait, unsigned int flags)
2882 {
2883         struct rtable *rt = skb_rtable(skb);
2884         struct rtmsg *r;
2885         struct nlmsghdr *nlh;
2886         long expires = 0;
2887         const struct inet_peer *peer = rt->peer;
2888         u32 id = 0, ts = 0, tsage = 0, error;
2889
2890         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2891         if (nlh == NULL)
2892                 return -EMSGSIZE;
2893
2894         r = nlmsg_data(nlh);
2895         r->rtm_family    = AF_INET;
2896         r->rtm_dst_len  = 32;
2897         r->rtm_src_len  = 0;
2898         r->rtm_tos      = rt->rt_key_tos;
2899         r->rtm_table    = RT_TABLE_MAIN;
2900         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2901         r->rtm_type     = rt->rt_type;
2902         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2903         r->rtm_protocol = RTPROT_UNSPEC;
2904         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2905         if (rt->rt_flags & RTCF_NOTIFY)
2906                 r->rtm_flags |= RTM_F_NOTIFY;
2907
2908         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2909
2910         if (rt->rt_key_src) {
2911                 r->rtm_src_len = 32;
2912                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2913         }
2914         if (rt->dst.dev)
2915                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2916 #ifdef CONFIG_IP_ROUTE_CLASSID
2917         if (rt->dst.tclassid)
2918                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2919 #endif
2920         if (rt_is_input_route(rt))
2921                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2922         else if (rt->rt_src != rt->rt_key_src)
2923                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2924
2925         if (rt->rt_dst != rt->rt_gateway)
2926                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2927
2928         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2929                 goto nla_put_failure;
2930
2931         if (rt->rt_mark)
2932                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2933
2934         error = rt->dst.error;
2935         if (peer) {
2936                 inet_peer_refcheck(rt->peer);
2937                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2938                 if (peer->tcp_ts_stamp) {
2939                         ts = peer->tcp_ts;
2940                         tsage = get_seconds() - peer->tcp_ts_stamp;
2941                 }
2942                 expires = ACCESS_ONCE(peer->pmtu_expires);
2943                 if (expires)
2944                         expires -= jiffies;
2945         }
2946
2947         if (rt_is_input_route(rt)) {
2948 #ifdef CONFIG_IP_MROUTE
2949                 __be32 dst = rt->rt_dst;
2950
2951                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2952                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2953                         int err = ipmr_get_route(net, skb,
2954                                                  rt->rt_src, rt->rt_dst,
2955                                                  r, nowait);
2956                         if (err <= 0) {
2957                                 if (!nowait) {
2958                                         if (err == 0)
2959                                                 return 0;
2960                                         goto nla_put_failure;
2961                                 } else {
2962                                         if (err == -EMSGSIZE)
2963                                                 goto nla_put_failure;
2964                                         error = err;
2965                                 }
2966                         }
2967                 } else
2968 #endif
2969                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2970         }
2971
2972         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2973                                expires, error) < 0)
2974                 goto nla_put_failure;
2975
2976         return nlmsg_end(skb, nlh);
2977
2978 nla_put_failure:
2979         nlmsg_cancel(skb, nlh);
2980         return -EMSGSIZE;
2981 }
2982
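/*
 * inet_rtm_getroute() answers RTM_GETROUTE requests - the query behind
 * "ip route get <dst> [from <src>] [iif <dev>] [oif <dev>] [mark <m>]".
 * RTA_DST, RTA_SRC, RTA_IIF, RTA_OIF and RTA_MARK select the lookup
 * key; when RTA_IIF is present the route is resolved as if a packet had
 * arrived on that device (using a dummy skb), otherwise an output route
 * is looked up.  The result is sent back via rt_fill_info().
 *
 * Rough sketch of the request userspace builds (illustrative only; the
 * RTA_* attribute payloads are appended after the rtmsg header):
 *
 *	struct {
 *		struct nlmsghdr nlh;
 *		struct rtmsg    rtm;
 *		char            attrs[64];
 *	} req = {
 *		.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg)),
 *		.nlh.nlmsg_type  = RTM_GETROUTE,
 *		.nlh.nlmsg_flags = NLM_F_REQUEST,
 *		.rtm.rtm_family  = AF_INET,
 *	};
 */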
2983 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2984 {
2985         struct net *net = sock_net(in_skb->sk);
2986         struct rtmsg *rtm;
2987         struct nlattr *tb[RTA_MAX+1];
2988         struct rtable *rt = NULL;
2989         __be32 dst = 0;
2990         __be32 src = 0;
2991         u32 iif;
2992         int err;
2993         int mark;
2994         struct sk_buff *skb;
2995
2996         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2997         if (err < 0)
2998                 goto errout;
2999
3000         rtm = nlmsg_data(nlh);
3001
3002         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3003         if (skb == NULL) {
3004                 err = -ENOBUFS;
3005                 goto errout;
3006         }
3007
3008         /* Reserve room for dummy headers; this skb can pass
3009            through a good chunk of the routing engine.
3010          */
3011         skb_reset_mac_header(skb);
3012         skb_reset_network_header(skb);
3013
3014         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3015         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3016         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3017
3018         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3019         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3020         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3021         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3022
3023         if (iif) {
3024                 struct net_device *dev;
3025
3026                 dev = __dev_get_by_index(net, iif);
3027                 if (dev == NULL) {
3028                         err = -ENODEV;
3029                         goto errout_free;
3030                 }
3031
3032                 skb->protocol   = htons(ETH_P_IP);
3033                 skb->dev        = dev;
3034                 skb->mark       = mark;
3035                 local_bh_disable();
3036                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3037                 local_bh_enable();
3038
3039                 rt = skb_rtable(skb);
3040                 if (err == 0 && rt->dst.error)
3041                         err = -rt->dst.error;
3042         } else {
3043                 struct flowi4 fl4 = {
3044                         .daddr = dst,
3045                         .saddr = src,
3046                         .flowi4_tos = rtm->rtm_tos,
3047                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3048                         .flowi4_mark = mark,
3049                 };
3050                 rt = ip_route_output_key(net, &fl4);
3051
3052                 err = 0;
3053                 if (IS_ERR(rt))
3054                         err = PTR_ERR(rt);
3055         }
3056
3057         if (err)
3058                 goto errout_free;
3059
3060         skb_dst_set(skb, &rt->dst);
3061         if (rtm->rtm_flags & RTM_F_NOTIFY)
3062                 rt->rt_flags |= RTCF_NOTIFY;
3063
3064         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3065                            RTM_NEWROUTE, 0, 0);
3066         if (err <= 0)
3067                 goto errout_free;
3068
3069         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3070 errout:
3071         return err;
3072
3073 errout_free:
3074         kfree_skb(skb);
3075         goto errout;
3076 }
3077
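/*
 * ip_rt_dump() is the RTM_GETROUTE dump handler: it walks the global
 * rt_hash_table under rcu_read_lock_bh() and emits one RTM_NEWROUTE
 * message per live cache entry.  cb->args[0] and cb->args[1] remember
 * the hash bucket and chain index reached so far, so a dump that fills
 * one skb can resume from the same position on the next callback.
 */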
3078 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3079 {
3080         struct rtable *rt;
3081         int h, s_h;
3082         int idx, s_idx;
3083         struct net *net;
3084
3085         net = sock_net(skb->sk);
3086
3087         s_h = cb->args[0];
3088         if (s_h < 0)
3089                 s_h = 0;
3090         s_idx = idx = cb->args[1];
3091         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3092                 if (!rt_hash_table[h].chain)
3093                         continue;
3094                 rcu_read_lock_bh();
3095                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3096                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3097                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3098                                 continue;
3099                         if (rt_is_expired(rt))
3100                                 continue;
3101                         skb_dst_set_noref(skb, &rt->dst);
3102                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3103                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3104                                          1, NLM_F_MULTI) <= 0) {
3105                                 skb_dst_drop(skb);
3106                                 rcu_read_unlock_bh();
3107                                 goto done;
3108                         }
3109                         skb_dst_drop(skb);
3110                 }
3111                 rcu_read_unlock_bh();
3112         }
3113
3114 done:
3115         cb->args[0] = h;
3116         cb->args[1] = idx;
3117         return skb->len;
3118 }
3119
3120 void ip_rt_multicast_event(struct in_device *in_dev)
3121 {
3122         rt_cache_flush(dev_net(in_dev->dev), 0);
3123 }
3124
3125 #ifdef CONFIG_SYSCTL
3126 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3127                                         void __user *buffer,
3128                                         size_t *lenp, loff_t *ppos)
3129 {
3130         if (write) {
3131                 int flush_delay;
3132                 ctl_table ctl;
3133                 struct net *net;
3134
3135                 memcpy(&ctl, __ctl, sizeof(ctl));
3136                 ctl.data = &flush_delay;
3137                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3138
3139                 net = (struct net *)__ctl->extra1;
3140                 rt_cache_flush(net, flush_delay);
3141                 return 0;
3142         }
3143
3144         return -EINVAL;
3145 }
3146
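/*
 * The tables below are exposed under /proc/sys/net/ipv4/route/ (the
 * per-netns "flush" entry comes from ipv4_route_flush_table further
 * down).  For example, assuming a standard sysctl setup:
 *
 *	# sysctl -w net.ipv4.route.gc_timeout=300
 *	# echo -1 > /proc/sys/net/ipv4/route/flush
 *
 * A write to "flush" lands in ipv4_sysctl_rtcache_flush() above, which
 * hands the written value to rt_cache_flush() as the flush delay.
 */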
3147 static ctl_table ipv4_route_table[] = {
3148         {
3149                 .procname       = "gc_thresh",
3150                 .data           = &ipv4_dst_ops.gc_thresh,
3151                 .maxlen         = sizeof(int),
3152                 .mode           = 0644,
3153                 .proc_handler   = proc_dointvec,
3154         },
3155         {
3156                 .procname       = "max_size",
3157                 .data           = &ip_rt_max_size,
3158                 .maxlen         = sizeof(int),
3159                 .mode           = 0644,
3160                 .proc_handler   = proc_dointvec,
3161         },
3162         {
3163                 /* Deprecated. Use gc_min_interval_ms */
3164
3165                 .procname       = "gc_min_interval",
3166                 .data           = &ip_rt_gc_min_interval,
3167                 .maxlen         = sizeof(int),
3168                 .mode           = 0644,
3169                 .proc_handler   = proc_dointvec_jiffies,
3170         },
3171         {
3172                 .procname       = "gc_min_interval_ms",
3173                 .data           = &ip_rt_gc_min_interval,
3174                 .maxlen         = sizeof(int),
3175                 .mode           = 0644,
3176                 .proc_handler   = proc_dointvec_ms_jiffies,
3177         },
3178         {
3179                 .procname       = "gc_timeout",
3180                 .data           = &ip_rt_gc_timeout,
3181                 .maxlen         = sizeof(int),
3182                 .mode           = 0644,
3183                 .proc_handler   = proc_dointvec_jiffies,
3184         },
3185         {
3186                 .procname       = "gc_interval",
3187                 .data           = &ip_rt_gc_interval,
3188                 .maxlen         = sizeof(int),
3189                 .mode           = 0644,
3190                 .proc_handler   = proc_dointvec_jiffies,
3191         },
3199         {
3200                 .procname       = "redirect_load",
3201                 .data           = &ip_rt_redirect_load,
3202                 .maxlen         = sizeof(int),
3203                 .mode           = 0644,
3204                 .proc_handler   = proc_dointvec,
3205         },
3206         {
3207                 .procname       = "redirect_number",
3208                 .data           = &ip_rt_redirect_number,
3209                 .maxlen         = sizeof(int),
3210                 .mode           = 0644,
3211                 .proc_handler   = proc_dointvec,
3212         },
3213         {
3214                 .procname       = "redirect_silence",
3215                 .data           = &ip_rt_redirect_silence,
3216                 .maxlen         = sizeof(int),
3217                 .mode           = 0644,
3218                 .proc_handler   = proc_dointvec,
3219         },
3220         {
3221                 .procname       = "error_cost",
3222                 .data           = &ip_rt_error_cost,
3223                 .maxlen         = sizeof(int),
3224                 .mode           = 0644,
3225                 .proc_handler   = proc_dointvec,
3226         },
3227         {
3228                 .procname       = "error_burst",
3229                 .data           = &ip_rt_error_burst,
3230                 .maxlen         = sizeof(int),
3231                 .mode           = 0644,
3232                 .proc_handler   = proc_dointvec,
3233         },
3234         {
3235                 .procname       = "gc_elasticity",
3236                 .data           = &ip_rt_gc_elasticity,
3237                 .maxlen         = sizeof(int),
3238                 .mode           = 0644,
3239                 .proc_handler   = proc_dointvec,
3240         },
3241         {
3242                 .procname       = "mtu_expires",
3243                 .data           = &ip_rt_mtu_expires,
3244                 .maxlen         = sizeof(int),
3245                 .mode           = 0644,
3246                 .proc_handler   = proc_dointvec_jiffies,
3247         },
3248         {
3249                 .procname       = "min_pmtu",
3250                 .data           = &ip_rt_min_pmtu,
3251                 .maxlen         = sizeof(int),
3252                 .mode           = 0644,
3253                 .proc_handler   = proc_dointvec,
3254         },
3255         {
3256                 .procname       = "min_adv_mss",
3257                 .data           = &ip_rt_min_advmss,
3258                 .maxlen         = sizeof(int),
3259                 .mode           = 0644,
3260                 .proc_handler   = proc_dointvec,
3261         },
3262         { }
3263 };
3264
3265 static struct ctl_table empty[1];
3266
3267 static struct ctl_table ipv4_skeleton[] =
3268 {
3269         { .procname = "route", 
3270           .mode = 0555, .child = ipv4_route_table},
3271         { .procname = "neigh", 
3272           .mode = 0555, .child = empty},
3273         { }
3274 };
3275
3276 static __net_initdata struct ctl_path ipv4_path[] = {
3277         { .procname = "net", },
3278         { .procname = "ipv4", },
3279         { },
3280 };
3281
3282 static struct ctl_table ipv4_route_flush_table[] = {
3283         {
3284                 .procname       = "flush",
3285                 .maxlen         = sizeof(int),
3286                 .mode           = 0200,
3287                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3288         },
3289         { },
3290 };
3291
3292 static __net_initdata struct ctl_path ipv4_route_path[] = {
3293         { .procname = "net", },
3294         { .procname = "ipv4", },
3295         { .procname = "route", },
3296         { },
3297 };
3298
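/*
 * Every non-init network namespace gets its own kmemdup()'d copy of
 * ipv4_route_flush_table so that extra1 can point back at that netns;
 * ipv4_sysctl_rtcache_flush() above fetches it via __ctl->extra1 and
 * flushes only the caller's namespace.  init_net keeps the static table.
 */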
3299 static __net_init int sysctl_route_net_init(struct net *net)
3300 {
3301         struct ctl_table *tbl;
3302
3303         tbl = ipv4_route_flush_table;
3304         if (!net_eq(net, &init_net)) {
3305                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3306                 if (tbl == NULL)
3307                         goto err_dup;
3308         }
3309         tbl[0].extra1 = net;
3310
3311         net->ipv4.route_hdr =
3312                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3313         if (net->ipv4.route_hdr == NULL)
3314                 goto err_reg;
3315         return 0;
3316
3317 err_reg:
3318         if (tbl != ipv4_route_flush_table)
3319                 kfree(tbl);
3320 err_dup:
3321         return -ENOMEM;
3322 }
3323
3324 static __net_exit void sysctl_route_net_exit(struct net *net)
3325 {
3326         struct ctl_table *tbl;
3327
3328         tbl = net->ipv4.route_hdr->ctl_table_arg;
3329         unregister_net_sysctl_table(net->ipv4.route_hdr);
3330         BUG_ON(tbl == ipv4_route_flush_table);
3331         kfree(tbl);
3332 }
3333
3334 static __net_initdata struct pernet_operations sysctl_route_ops = {
3335         .init = sysctl_route_net_init,
3336         .exit = sysctl_route_net_exit,
3337 };
3338 #endif
3339
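/*
 * rt_genid is the per-netns generation counter of the routing cache:
 * cached entries record rt_genid(net) when they are created and
 * rt_is_expired() treats any mismatch as stale, so bumping the counter
 * invalidates the whole cache without walking it.  Seeding it (and
 * dev_addr_genid) with random bytes avoids predictable generation
 * numbers when namespaces are created.
 */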
3340 static __net_init int rt_genid_init(struct net *net)
3341 {
3342         get_random_bytes(&net->ipv4.rt_genid,
3343                          sizeof(net->ipv4.rt_genid));
3344         get_random_bytes(&net->ipv4.dev_addr_genid,
3345                          sizeof(net->ipv4.dev_addr_genid));
3346         return 0;
3347 }
3348
3349 static __net_initdata struct pernet_operations rt_genid_ops = {
3350         .init = rt_genid_init,
3351 };
3352
3353
3354 #ifdef CONFIG_IP_ROUTE_CLASSID
3355 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3356 #endif /* CONFIG_IP_ROUTE_CLASSID */
3357
3358 static __initdata unsigned long rhash_entries;
3359 static int __init set_rhash_entries(char *str)
3360 {
3361         if (!str)
3362                 return 0;
3363         rhash_entries = simple_strtoul(str, &str, 0);
3364         return 1;
3365 }
3366 __setup("rhash_entries=", set_rhash_entries);
3367
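/*
 * "rhash_entries=N" on the kernel command line overrides the size of
 * the route cache hash table allocated by ip_rt_init() below; e.g.
 * booting with "rhash_entries=262144" (a value chosen here purely for
 * illustration) bypasses the memory-based scaling done by
 * alloc_large_system_hash().
 */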
3368 int __init ip_rt_init(void)
3369 {
3370         int rc = 0;
3371
3372 #ifdef CONFIG_IP_ROUTE_CLASSID
3373         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3374         if (!ip_rt_acct)
3375                 panic("IP: failed to allocate ip_rt_acct\n");
3376 #endif
3377
3378         ipv4_dst_ops.kmem_cachep =
3379                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3380                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3381
3382         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3383
3384         if (dst_entries_init(&ipv4_dst_ops) < 0)
3385                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3386
3387         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3388                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3389
3390         rt_hash_table = (struct rt_hash_bucket *)
3391                 alloc_large_system_hash("IP route cache",
3392                                         sizeof(struct rt_hash_bucket),
3393                                         rhash_entries,
3394                                         (totalram_pages >= 128 * 1024) ?
3395                                         15 : 17,
3396                                         0,
3397                                         &rt_hash_log,
3398                                         &rt_hash_mask,
3399                                         rhash_entries ? 0 : 512 * 1024);
3400         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3401         rt_hash_lock_init();
3402
3403         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3404         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3405
3406         devinet_init();
3407         ip_fib_init();
3408
3409         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3410         expires_ljiffies = jiffies;
3411         schedule_delayed_work(&expires_work,
3412                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3413
3414         if (ip_rt_proc_init())
3415                 printk(KERN_ERR "Unable to create route proc files\n");
3416 #ifdef CONFIG_XFRM
3417         xfrm_init();
3418         xfrm4_init(ip_rt_max_size);
3419 #endif
3420         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3421
3422 #ifdef CONFIG_SYSCTL
3423         register_pernet_subsys(&sysctl_route_ops);
3424 #endif
3425         register_pernet_subsys(&rt_genid_ops);
3426         return rc;
3427 }
3428
3429 #ifdef CONFIG_SYSCTL
3430 /*
3431  * We really need to sanitize the damn ipv4 init order, then all
3432  * this nonsense will go away.
3433  */
3434 void __init ip_static_sysctl_init(void)
3435 {
3436         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3437 }
3438 #endif