/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *	Alan Cox	:	Verify area fixes.
 *	Alan Cox	:	cli() protects routing changes
 *	Rui Oliveira	:	ICMP routing table updates
 *	(rco@di.uminho.pt)	Routing table insertion and update
 *	Linus Torvalds	:	Rewrote bits to be sensible
 *	Alan Cox	:	Added BSD route gw semantics
 *	Alan Cox	:	Super /proc >4K
 *	Alan Cox	:	MTU in route table
 *	Alan Cox	:	MSS actually. Also added the window clamping.
 *	Sam Lantinga	:	Fixed route matching in rt_del()
 *	Alan Cox	:	Routing cache support.
 *	Alan Cox	:	Removed compatibility cruft.
 *	Alan Cox	:	RTF_REJECT support.
 *	Alan Cox	:	TCP irtt support.
 *	Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *	Alan Cox	:	Use __u32 properly
 *	Alan Cox	:	Aligned routing errors more closely with BSD;
 *				our system is still very different.
 *	Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *				routing caches and better behaviour.
 *
 *	Olaf Erb	:	irtt wasn't being copied right.
 *	Bjorn Ekwall	:	Kerneld route support.
 *	Alan Cox	:	Multicast fixed (I hope)
 *	Pavel Krauz	:	Limited broadcast fixed
 *	Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *				route.c and rewritten from scratch.
 *	Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *	Marc Boucher	:	routing by fwmark
 *	Robert Olsson	:	Added rt_cache statistics
 *	Arnaldo C. Melo	:	Convert proc stuff to seq_file
 *	Eric Dumazet	:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov	:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov	:	Removed TOS from hash calculations
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#include <net/atmclip.h>
#include <net/secure_seq.h>
#define RT_FL_TOS(oldflp4) \
	((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
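/* RT_FL_TOS keeps only the bits of the flow TOS that matter for routing:
 * the IPTOS_RT_MASK portion of the TOS byte plus the RTO_ONLINK flag, which
 * requests a link-scope lookup (see how ip_route_output_slow() maps
 * RTO_ONLINK to RT_SCOPE_LINK below).
 */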
#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300*HZ)
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly	= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;
static int redirect_genid;
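/* Most of the tunables above are exported via sysctl under
 * /proc/sys/net/ipv4/route/.  Note how the redirect-silence default is
 * derived from the other two redirect defaults:
 * (HZ / 50) << (9 + 1) == ip_rt_redirect_load << (ip_rt_redirect_number + 1),
 * i.e. silence kicks in once the exponential backoff has run its course.
 */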
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);
		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else if (rt->fi) {
			fib_info_put(rt->fi);
			rt->fi = NULL;
		}
	}
	return p;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
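/* The table is indexed by the TOS byte shifted right by one (see
 * rt_tos2priority() in include/net/route.h), so each TC_PRIO_* class
 * appears twice: once for the plain TOS value and once for the variant
 * with the low "minimize monetary cost" bit set, which ECN_OR_COST()
 * maps to the same priority.
 */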
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    help of RCU.
 */
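/* Illustrative sketch of the resulting reader pattern (this is what
 * ip_route_input_common() below actually does): walk a chain under RCU,
 * bump the refcount only on a match, never block.  keys_match() stands in
 * for the real key comparison:
 *
 *	rcu_read_lock();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->dst.rt_next))
 *		if (keys_match(rth))
 *			dst_use(&rth->dst, jiffies);  (atomic refcount bump)
 *	rcu_read_unlock();
 */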
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif
static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
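/* Lock striping: a bucket index hashes down to one of RT_HASH_LOCK_SZ locks
 * via its low bits, so unrelated buckets may share a spinlock.  E.g. with
 * RT_HASH_LOCK_SZ == 256, buckets 5, 261, 517, ... all serialize on
 * rt_hash_locks[5].  Readers are unaffected: they use only RCU.
 */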
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_ATOMIC);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid) & rt_hash_mask;
}
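/* The generation id is folded into the hash as jhash's initval, so bumping
 * rt_genid effectively rehashes every cached route into a new bucket: stale
 * entries become unreachable by lookup and are reaped lazily (see
 * rt_cache_invalidate() below).
 */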
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;

	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len;

		n = dst_get_neighbour(&r->dst);
		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->rt_key_tos,
			   -1,
			   (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
			   r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			    sizeof(struct rt_cache_iter_state));
}
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;
	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,
		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
				   &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
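/* Because the age is inverted before masking, a recently used entry gets a
 * large 30-bit component and an idle one gets a small one, with bits 31/30
 * promoting valuable and output/unicast routes above that.
 * rt_intern_hash() below evicts the entry with the *lowest* score, i.e.
 * the oldest, least valuable one in the chain.
 */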
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
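/* Both helpers use the classic branch-free comparison: XOR each pair of
 * fields (zero iff equal) and OR the results together, so the whole key
 * matches exactly when the accumulated value is 0.  compare_hash_inputs()
 * covers only the fields that feed rt_hash(); compare_keys() covers the
 * full lookup key (mark, tos and oif included).
 */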
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
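/* Worked example: with FRACT_BITS == 3, ONE == 8, so a true average chain
 * length of 2.5 is represented as 20 (2.5 * 8); shifting right by FRACT_BITS
 * recovers the integer part, as slow_chain_length() does below.
 */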
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}
/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without reusing a recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	redirect_genid++;
}
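/* After the bump, every cached rtable whose rt_genid predates the new value
 * fails rt_is_expired(), so lookups skip it and writers prune it on sight;
 * no synchronous walk of the whole table is required.
 */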
/*
 * delay < 0  : invalidate cache (fast: entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previously cache-invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
/*
 * Short description of GC goals.
 *
 * We want to build an algorithm which will keep the routing cache
 * at some equilibrium point, where the number of aged-off entries
 * is kept approximately equal to the number of newly generated ones.
 *
 * Current expiration strength is the variable "expire".
 * We try to adjust it dynamically, so that if networking
 * is idle expire is large enough to keep enough warm entries,
 * and when load increases it reduces to limit the cache size.
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;
			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
				    !rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire was reduced to zero. Otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}
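/* Each chain element with no earlier alias contributes ONE (1 << FRACT_BITS)
 * to "length", so the final shift yields the count of distinct hash inputs.
 * E.g. a chain of 10 entries that are all aliases of each other has a slow
 * chain length of 1.
 */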
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	struct neigh_table *tbl = &arp_tbl;
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	struct neighbour *n;

#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
	if (dev->type == ARPHRD_ATM)
		tbl = clip_tbl_hook;
#endif
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;

	n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(tbl, pkey, dev);
}
static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it.
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when it was exceeded, gc became really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that stays unique for a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If the peer is attached to the destination, it is never
		 * detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n)) {
		rt->rt_gateway = orig_gw;
		return -EAGAIN;
	}
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!n || !(n->nud_state & NUD_VALID)) {
		if (n)
			neigh_event_send(n, NULL);
		rt->rt_gateway = orig_gw;
		return -EAGAIN;
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
	return 0;
}
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw ||
					    peer->redirect_genid != redirect_genid) {
						peer->redirect_learned.a4 = new_gw;
						peer->redirect_genid = redirect_genid;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}
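/* The cmpxchg() against the snapshotted expiry means only one caller can
 * "claim" a learned PMTU: whoever swaps pmtu_expires to 0 first wins and is
 * responsible for restoring pmtu_orig; concurrent callers see the cmpxchg
 * fail and do nothing.
 */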
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}
/*
 * 1. The first ip_rt_redirect_number redirects are sent
 *    with exponential backoff, then we stop sending them at all,
 *    assuming that the host ignores our redirects.
 * 2. If we did not see packets requiring redirects
 *    during ip_rt_redirect_silence, we assume that the host
 *    forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
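/* Example: a Fragmentation Needed message that carries no next-hop MTU but
 * reports an old_mtu of 1500 walks the plateau table and guesses 1492 (the
 * next value strictly below 1500); an old_mtu of 128 or less falls through
 * to the 68-byte IPv4 minimum.
 */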
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(dst, peer);

			if (peer->redirect_genid != redirect_genid)
				peer->redirect_learned.a4 = 0;
			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway) {
				if (check_peer_redir(dst, peer))
					return NULL;
			}
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
	return dst;
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}
static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be not aligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_genid != redirect_genid)
			peer->redirect_learned.a4 = 0;
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation: if the source is martian,
		 * the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back out the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif 	= in_dev->dev->ifindex;
	rth->rt_oif 	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
/*
 * NOTE. We drop all packets that have a local source
 * address, because every properly looped-back packet
 * must already have the correct destination attached by the output routine.
 *
 * Such an approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 * called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net	*net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable	*rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	u32 tos = RT_FL_TOS(fl4);
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use the default one,
		 * but do not gateway in this case. Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid	= rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark	= fl4->flowi4_mark;
	rth->rt_gateway	= fl4->daddr;
	rth->rt_spec_dst = fl4->saddr;
	rth->rt_peer_genid = 0;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
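
/*
 * Note the split between lookup-key fields and resolved fields in the
 * entry built above: rt_key_dst/rt_key_src/rt_key_tos hold the original
 * request (so later cache probes can compare against the raw flow),
 * while rt_dst/rt_src/rt_gateway hold what the FIB actually resolved.
 * E.g. a lookup with saddr 0 is cached with rt_key_src == 0 but rt_src
 * set to the selected source address.
 */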
/*
 * Major route resolver routine.
 * called with rcu_read_lock();
 */

static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	u32 tos	= RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			     RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		 * It was wrong for two reasons:
		 * 1. ip_dev_find(net, saddr) can return the wrong iface, if
		 *    saddr is assigned to multiple interfaces.
		 * 2. Moreover, we are allowed to send packets with the saddr
		 *    of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			 * and limited broadcast via the necessary interface
			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			 * This hack is not just for fun, it allows
			 * vic, vat and friends to work.
			 * They bind a socket to loopback, set ttl to zero
			 * and expect that it will work.
			 * From the viewpoint of the routing cache they are
			 * broken, because we are not allowed to build a
			 * multicast path with a loopback source addr (look,
			 * the routing cache cannot know that ttl is zero, so
			 * the packet will not leave this host and the route
			 * is valid). Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}
	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}
	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume
			 * that the destination is on-link.
			 *
			 * WHY? DW.
			 * Because we are allowed to send to an iface
			 * even if it has NO routes and NO assigned
			 * addresses. When oif is specified, routing
			 * tables are looked up with only one purpose:
			 * to catch if the destination is gatewayed, rather
			 * than direct. Moreover, if MSG_DONTROUTE is set,
			 * we send a packet, ignoring both routing tables
			 * and ifaddr state. --ANK
			 *
			 * We could make it even if oif is unknown,
			 * likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}
	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;
make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
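
/*
 * In short, the resolver above works through a fixed ladder:
 * validate saddr -> honour an explicit oif -> fall back to loopback for
 * an empty daddr -> fib_lookup() -> special-case local, multipath and
 * default routes -> __mkroute_output() -> rt_intern_hash(). A failed
 * fib_lookup() with an explicit oif is deliberately treated as
 * "destination is on-link" rather than as an error.
 */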
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
		      (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
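
/*
 * A minimal caller sketch (the local variable names are illustrative,
 * the API is the one defined above):
 *
 *	struct flowi4 fl4 = {
 *		.daddr      = daddr,
 *		.saddr      = 0,	// let the resolver pick a source
 *		.flowi4_tos = RT_TOS(tos),
 *		.flowi4_oif = 0,
 *	};
 *	struct rtable *rt = __ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	// fl4.saddr now holds the selected source address
 *	ip_rt_put(rt);
 *
 * Note the error convention: an ERR_PTR() is returned, never NULL.
 */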
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
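
/*
 * The blackhole route above clones an existing route but silently drops
 * every packet sent through it (dst_discard on both input and output).
 * It is handed out when a usable dst must be returned even though
 * traffic may not flow yet -- e.g. while IPsec (xfrm) key negotiation
 * is still pending on a non-blocking socket.
 */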
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
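
/*
 * ip_route_output_flow() is the variant protocol code should use when
 * the flow may be subject to IPsec policy: it first resolves a plain
 * route, then lets xfrm_lookup() wrap it in a transform bundle if
 * flowi4_proto is set. A hypothetical TCP-style caller:
 *
 *	rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */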
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
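
/*
 * Reminder for readers unfamiliar with the NLA_PUT*() macros used above:
 * they expand to a put that jumps to the local nla_put_failure label
 * when the skb runs out of tailroom, which is why the function can
 * assume every attribute either fits or unwinds through nlmsg_cancel().
 */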
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
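
/*
 * This handler is what userspace hits with "ip route get <addr>":
 * iproute2 sends an RTM_GETROUTE request whose RTA_DST/RTA_SRC/RTA_IIF/
 * RTA_OIF/RTA_MARK attributes map onto the lookup performed above, and
 * the reply is a single RTM_NEWROUTE message built by rt_fill_info().
 */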
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
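
/*
 * Dump state lives in cb->args[]: args[0] is the next hash bucket and
 * args[1] the next index within it, so an interrupted dump resumes
 * exactly where the previous netlink skb filled up.
 */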
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
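
/*
 * Writing an integer to /proc/sys/net/ipv4/route/flush lands here; the
 * value is interpreted as a flush delay, so e.g. from a shell:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * flushes the route cache immediately. Reads return -EINVAL.
 */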
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct),
				    __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif