net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/inetdevice.h>
  84 #include <linux/igmp.h>
  85 #include <linux/pkt_sched.h>
  86 #include <linux/mroute.h>
  87 #include <linux/netfilter_ipv4.h>
  88 #include <linux/random.h>
  89 #include <linux/rcupdate.h>
  90 #include <linux/times.h>
  91 #include <linux/slab.h>
  92 #include <linux/jhash.h>
  93 #include <net/dst.h>
  94 #include <net/dst_metadata.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/lwtunnel.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #include <linux/kmemleak.h>
 112 #endif
 113 #include <net/secure_seq.h>
 114 #include <net/ip_tunnels.h>
 115 #include <net/l3mdev.h>
 116
 117 #define RT_FL_TOS(oldflp4) \
 118         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 119
 120 #define RT_GC_TIMEOUT (300*HZ)
 121
 122 static int ip_rt_max_size;
 123 static int ip_rt_redirect_number __read_mostly  = 9;
 124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 126 static int ip_rt_error_cost __read_mostly       = HZ;
 127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 130 static int ip_rt_min_advmss __read_mostly       = 256;
 131
 132 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 133 /*
 134  *      Interface to generic destination cache.
 135  */
 136
 137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 138 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 139 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 141 static void              ipv4_link_failure(struct sk_buff *skb);
 142 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 143                                            struct sk_buff *skb, u32 mtu);
 144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 145                                         struct sk_buff *skb);
 146 static void             ipv4_dst_destroy(struct dst_entry *dst);
 147
 148 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 149 {
 150         WARN_ON(1);
 151         return NULL;
 152 }
 153
 154 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 155                                            struct sk_buff *skb,
 156                                            const void *daddr);
 157
 158 static struct dst_ops ipv4_dst_ops = {
 159         .family =               AF_INET,
 160         .check =                ipv4_dst_check,
 161         .default_advmss =       ipv4_default_advmss,
 162         .mtu =                  ipv4_mtu,
 163         .cow_metrics =          ipv4_cow_metrics,
 164         .destroy =              ipv4_dst_destroy,
 165         .negative_advice =      ipv4_negative_advice,
 166         .link_failure =         ipv4_link_failure,
 167         .update_pmtu =          ip_rt_update_pmtu,
 168         .redirect =             ip_do_redirect,
 169         .local_out =            __ip_local_out,
 170         .neigh_lookup =         ipv4_neigh_lookup,
 171 };
 172
 173 #define ECN_OR_COST(class)      TC_PRIO_##class
 174
 175 const __u8 ip_tos2prio[16] = {
 176         TC_PRIO_BESTEFFORT,
 177         ECN_OR_COST(BESTEFFORT),
 178         TC_PRIO_BESTEFFORT,
 179         ECN_OR_COST(BESTEFFORT),
 180         TC_PRIO_BULK,
 181         ECN_OR_COST(BULK),
 182         TC_PRIO_BULK,
 183         ECN_OR_COST(BULK),
 184         TC_PRIO_INTERACTIVE,
 185         ECN_OR_COST(INTERACTIVE),
 186         TC_PRIO_INTERACTIVE,
 187         ECN_OR_COST(INTERACTIVE),
 188         TC_PRIO_INTERACTIVE_BULK,
 189         ECN_OR_COST(INTERACTIVE_BULK),
 190         TC_PRIO_INTERACTIVE_BULK,
 191         ECN_OR_COST(INTERACTIVE_BULK)
 192 };
 193 EXPORT_SYMBOL(ip_tos2prio);
 194
 195 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 196 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 197
 198 #ifdef CONFIG_PROC_FS
 199 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 200 {
 201         if (*pos)
 202                 return NULL;
 203         return SEQ_START_TOKEN;
 204 }
 205
 206 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 207 {
 208         ++*pos;
 209         return NULL;
 210 }
 211
 212 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 213 {
 214 }
 215
 216 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 217 {
 218         if (v == SEQ_START_TOKEN)
 219                 seq_printf(seq, "%-127s\n",
 220                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 221                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 222                            "HHUptod\tSpecDst");
 223         return 0;
 224 }
 225
 226 static const struct seq_operations rt_cache_seq_ops = {
 227         .start  = rt_cache_seq_start,
 228         .next   = rt_cache_seq_next,
 229         .stop   = rt_cache_seq_stop,
 230         .show   = rt_cache_seq_show,
 231 };
 232
 233 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 234 {
 235         return seq_open(file, &rt_cache_seq_ops);
 236 }
 237
 238 static const struct file_operations rt_cache_seq_fops = {
 239         .owner   = THIS_MODULE,
 240         .open    = rt_cache_seq_open,
 241         .read    = seq_read,
 242         .llseek  = seq_lseek,
 243         .release = seq_release,
 244 };
 245
 246
 247 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 248 {
 249         int cpu;
 250
 251         if (*pos == 0)
 252                 return SEQ_START_TOKEN;
 253
 254         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 255                 if (!cpu_possible(cpu))
 256                         continue;
 257                 *pos = cpu+1;
 258                 return &per_cpu(rt_cache_stat, cpu);
 259         }
 260         return NULL;
 261 }
 262
 263 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 264 {
 265         int cpu;
 266
 267         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 268                 if (!cpu_possible(cpu))
 269                         continue;
 270                 *pos = cpu+1;
 271                 return &per_cpu(rt_cache_stat, cpu);
 272         }
 273         return NULL;
 274
 275 }
 276
 277 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 278 {
 279
 280 }
 281
 282 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 283 {
 284         struct rt_cache_stat *st = v;
 285
 286         if (v == SEQ_START_TOKEN) {
 287                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 288                 return 0;
 289         }
 290
 291         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 292                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 293                    dst_entries_get_slow(&ipv4_dst_ops),
 294                    0, /* st->in_hit */
 295                    st->in_slow_tot,
 296                    st->in_slow_mc,
 297                    st->in_no_route,
 298                    st->in_brd,
 299                    st->in_martian_dst,
 300                    st->in_martian_src,
 301
 302                    0, /* st->out_hit */
 303                    st->out_slow_tot,
 304                    st->out_slow_mc,
 305
 306                    0, /* st->gc_total */
 307                    0, /* st->gc_ignored */
 308                    0, /* st->gc_goal_miss */
 309                    0, /* st->gc_dst_overflow */
 310                    0, /* st->in_hlist_search */
 311                    0  /* st->out_hlist_search */
 312                 );
 313         return 0;
 314 }
 315
 316 static const struct seq_operations rt_cpu_seq_ops = {
 317         .start  = rt_cpu_seq_start,
 318         .next   = rt_cpu_seq_next,
 319         .stop   = rt_cpu_seq_stop,
 320         .show   = rt_cpu_seq_show,
 321 };
 322
 323
 324 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 325 {
 326         return seq_open(file, &rt_cpu_seq_ops);
 327 }
 328
 329 static const struct file_operations rt_cpu_seq_fops = {
 330         .owner   = THIS_MODULE,
 331         .open    = rt_cpu_seq_open,
 332         .read    = seq_read,
 333         .llseek  = seq_lseek,
 334         .release = seq_release,
 335 };
 336
 337 #ifdef CONFIG_IP_ROUTE_CLASSID
 338 static int rt_acct_proc_show(struct seq_file *m, void *v)
 339 {
 340         struct ip_rt_acct *dst, *src;
 341         unsigned int i, j;
 342
 343         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 344         if (!dst)
 345                 return -ENOMEM;
 346
 347         for_each_possible_cpu(i) {
 348                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 349                 for (j = 0; j < 256; j++) {
 350                         dst[j].o_bytes   += src[j].o_bytes;
 351                         dst[j].o_packets += src[j].o_packets;
 352                         dst[j].i_bytes   += src[j].i_bytes;
 353                         dst[j].i_packets += src[j].i_packets;
 354                 }
 355         }
 356
 357         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 358         kfree(dst);
 359         return 0;
 360 }
 361
 362 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 363 {
 364         return single_open(file, rt_acct_proc_show, NULL);
 365 }
 366
 367 static const struct file_operations rt_acct_proc_fops = {
 368         .owner          = THIS_MODULE,
 369         .open           = rt_acct_proc_open,
 370         .read           = seq_read,
 371         .llseek         = seq_lseek,
 372         .release        = single_release,
 373 };
 374 #endif
 375
 376 static int __net_init ip_rt_do_proc_init(struct net *net)
 377 {
 378         struct proc_dir_entry *pde;
 379
 380         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 381                           &rt_cache_seq_fops);
 382         if (!pde)
 383                 goto err1;
 384
 385         pde = proc_create("rt_cache", S_IRUGO,
 386                           net->proc_net_stat, &rt_cpu_seq_fops);
 387         if (!pde)
 388                 goto err2;
 389
 390 #ifdef CONFIG_IP_ROUTE_CLASSID
 391         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 392         if (!pde)
 393                 goto err3;
 394 #endif
 395         return 0;
 396
 397 #ifdef CONFIG_IP_ROUTE_CLASSID
 398 err3:
 399         remove_proc_entry("rt_cache", net->proc_net_stat);
 400 #endif
 401 err2:
 402         remove_proc_entry("rt_cache", net->proc_net);
 403 err1:
 404         return -ENOMEM;
 405 }
 406
 407 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 408 {
 409         remove_proc_entry("rt_cache", net->proc_net_stat);
 410         remove_proc_entry("rt_cache", net->proc_net);
 411 #ifdef CONFIG_IP_ROUTE_CLASSID
 412         remove_proc_entry("rt_acct", net->proc_net);
 413 #endif
 414 }
 415
 416 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 417         .init = ip_rt_do_proc_init,
 418         .exit = ip_rt_do_proc_exit,
 419 };
 420
 421 static int __init ip_rt_proc_init(void)
 422 {
 423         return register_pernet_subsys(&ip_rt_proc_ops);
 424 }
 425
 426 #else
 427 static inline int ip_rt_proc_init(void)
 428 {
 429         return 0;
 430 }
 431 #endif /* CONFIG_PROC_FS */
 432
 433 static inline bool rt_is_expired(const struct rtable *rth)
 434 {
 435         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 436 }
 437
 438 void rt_cache_flush(struct net *net)
 439 {
 440         rt_genid_bump_ipv4(net);
 441 }
 442
 443 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 444                                            struct sk_buff *skb,
 445                                            const void *daddr)
 446 {
 447         struct net_device *dev = dst->dev;
 448         const __be32 *pkey = daddr;
 449         const struct rtable *rt;
 450         struct neighbour *n;
 451
 452         rt = (const struct rtable *) dst;
 453         if (rt->rt_gateway)
 454                 pkey = (const __be32 *) &rt->rt_gateway;
 455         else if (skb)
 456                 pkey = &ip_hdr(skb)->daddr;
 457
 458         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 459         if (n)
 460                 return n;
 461         return neigh_create(&arp_tbl, pkey, dev);
 462 }
 463
 464 #define IP_IDENTS_SZ 2048u
 465
 466 static atomic_t *ip_idents __read_mostly;
 467 static u32 *ip_tstamps __read_mostly;
 468
 469 /* In order to protect privacy, we add a perturbation to identifiers
 470  * if one generator is seldom used. This makes hard for an attacker
 471  * to infer how many packets were sent between two points in time.
 472  */
 473 u32 ip_idents_reserve(u32 hash, int segs)
 474 {
 475         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 476         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 477         u32 old = ACCESS_ONCE(*p_tstamp);
 478         u32 now = (u32)jiffies;
 479         u32 delta = 0;
 480
 481         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 482                 delta = prandom_u32_max(now - old);
 483
 484         return atomic_add_return(segs + delta, p_id) - segs;
 485 }
 486 EXPORT_SYMBOL(ip_idents_reserve);
 487
 488 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 489 {
 490         static u32 ip_idents_hashrnd __read_mostly;
 491         u32 hash, id;
 492
 493         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 494
 495         hash = jhash_3words((__force u32)iph->daddr,
 496                             (__force u32)iph->saddr,
 497                             iph->protocol ^ net_hash_mix(net),
 498                             ip_idents_hashrnd);
 499         id = ip_idents_reserve(hash, segs);
 500         iph->id = htons(id);
 501 }
 502 EXPORT_SYMBOL(__ip_select_ident);
 503
 504 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 505                              const struct sock *sk,
 506                              const struct iphdr *iph,
 507                              int oif, u8 tos,
 508                              u8 prot, u32 mark, int flow_flags)
 509 {
 510         if (sk) {
 511                 const struct inet_sock *inet = inet_sk(sk);
 512
 513                 oif = sk->sk_bound_dev_if;
 514                 mark = sk->sk_mark;
 515                 tos = RT_CONN_FLAGS(sk);
 516                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 517         }
 518         flowi4_init_output(fl4, oif, mark, tos,
 519                            RT_SCOPE_UNIVERSE, prot,
 520                            flow_flags,
 521                            iph->daddr, iph->saddr, 0, 0,
 522                            sock_net_uid(net, sk));
 523 }
 524
 525 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 526                                const struct sock *sk)
 527 {
 528         const struct net *net = dev_net(skb->dev);
 529         const struct iphdr *iph = ip_hdr(skb);
 530         int oif = skb->dev->ifindex;
 531         u8 tos = RT_TOS(iph->tos);
 532         u8 prot = iph->protocol;
 533         u32 mark = skb->mark;
 534
 535         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 536 }
 537
 538 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 539 {
 540         const struct inet_sock *inet = inet_sk(sk);
 541         const struct ip_options_rcu *inet_opt;
 542         __be32 daddr = inet->inet_daddr;
 543
 544         rcu_read_lock();
 545         inet_opt = rcu_dereference(inet->inet_opt);
 546         if (inet_opt && inet_opt->opt.srr)
 547                 daddr = inet_opt->opt.faddr;
 548         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 549                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 550                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 551                            inet_sk_flowi_flags(sk),
 552                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 553         rcu_read_unlock();
 554 }
 555
 556 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 557                                  const struct sk_buff *skb)
 558 {
 559         if (skb)
 560                 build_skb_flow_key(fl4, skb, sk);
 561         else
 562                 build_sk_flow_key(fl4, sk);
 563 }
 564
 565 static inline void rt_free(struct rtable *rt)
 566 {
 567         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 568 }
 569
 570 static DEFINE_SPINLOCK(fnhe_lock);
 571
 572 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 573 {
 574         struct rtable *rt;
 575
 576         rt = rcu_dereference(fnhe->fnhe_rth_input);
 577         if (rt) {
 578                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 579                 rt_free(rt);
 580         }
 581         rt = rcu_dereference(fnhe->fnhe_rth_output);
 582         if (rt) {
 583                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 584                 rt_free(rt);
 585         }
 586 }
 587
 588 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 589 {
 590         struct fib_nh_exception *fnhe, *oldest;
 591
 592         oldest = rcu_dereference(hash->chain);
 593         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 594              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 595                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 596                         oldest = fnhe;
 597         }
 598         fnhe_flush_routes(oldest);
 599         return oldest;
 600 }
 601
 602 static inline u32 fnhe_hashfun(__be32 daddr)
 603 {
 604         static u32 fnhe_hashrnd __read_mostly;
 605         u32 hval;
 606
 607         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 608         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 609         return hash_32(hval, FNHE_HASH_SHIFT);
 610 }
 611
 612 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 613 {
 614         rt->rt_pmtu = fnhe->fnhe_pmtu;
 615         rt->dst.expires = fnhe->fnhe_expires;
 616
 617         if (fnhe->fnhe_gw) {
 618                 rt->rt_flags |= RTCF_REDIRECTED;
 619                 rt->rt_gateway = fnhe->fnhe_gw;
 620                 rt->rt_uses_gateway = 1;
 621         }
 622 }
 623
 624 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 625                                   u32 pmtu, unsigned long expires)
 626 {
 627         struct fnhe_hash_bucket *hash;
 628         struct fib_nh_exception *fnhe;
 629         struct rtable *rt;
 630         unsigned int i;
 631         int depth;
 632         u32 hval = fnhe_hashfun(daddr);
 633
 634         spin_lock_bh(&fnhe_lock);
 635
 636         hash = rcu_dereference(nh->nh_exceptions);
 637         if (!hash) {
 638                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 639                 if (!hash)
 640                         goto out_unlock;
 641                 rcu_assign_pointer(nh->nh_exceptions, hash);
 642         }
 643
 644         hash += hval;
 645
 646         depth = 0;
 647         for (fnhe = rcu_dereference(hash->chain); fnhe;
 648              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 649                 if (fnhe->fnhe_daddr == daddr)
 650                         break;
 651                 depth++;
 652         }
 653
 654         if (fnhe) {
 655                 if (gw)
 656                         fnhe->fnhe_gw = gw;
 657                 if (pmtu) {
 658                         fnhe->fnhe_pmtu = pmtu;
 659                         fnhe->fnhe_expires = max(1UL, expires);
 660                 }
 661                 /* Update all cached dsts too */
 662                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 663                 if (rt)
 664                         fill_route_from_fnhe(rt, fnhe);
 665                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 666                 if (rt)
 667                         fill_route_from_fnhe(rt, fnhe);
 668         } else {
 669                 if (depth > FNHE_RECLAIM_DEPTH)
 670                         fnhe = fnhe_oldest(hash);
 671                 else {
 672                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 673                         if (!fnhe)
 674                                 goto out_unlock;
 675
 676                         fnhe->fnhe_next = hash->chain;
 677                         rcu_assign_pointer(hash->chain, fnhe);
 678                 }
 679                 fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
 680                 fnhe->fnhe_daddr = daddr;
 681                 fnhe->fnhe_gw = gw;
 682                 fnhe->fnhe_pmtu = pmtu;
 683                 fnhe->fnhe_expires = expires;
 684
 685                 /* Exception created; mark the cached routes for the nexthop
 686                  * stale, so anyone caching it rechecks if this exception
 687                  * applies to them.
 688                  */
 689                 rt = rcu_dereference(nh->nh_rth_input);
 690                 if (rt)
 691                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 692
 693                 for_each_possible_cpu(i) {
 694                         struct rtable __rcu **prt;
 695                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 696                         rt = rcu_dereference(*prt);
 697                         if (rt)
 698                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 699                 }
 700         }
 701
 702         fnhe->fnhe_stamp = jiffies;
 703
 704 out_unlock:
 705         spin_unlock_bh(&fnhe_lock);
 706 }
 707
 708 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 709                              bool kill_route)
 710 {
 711         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 712         __be32 old_gw = ip_hdr(skb)->saddr;
 713         struct net_device *dev = skb->dev;
 714         struct in_device *in_dev;
 715         struct fib_result res;
 716         struct neighbour *n;
 717         struct net *net;
 718
 719         switch (icmp_hdr(skb)->code & 7) {
 720         case ICMP_REDIR_NET:
 721         case ICMP_REDIR_NETTOS:
 722         case ICMP_REDIR_HOST:
 723         case ICMP_REDIR_HOSTTOS:
 724                 break;
 725
 726         default:
 727                 return;
 728         }
 729
 730         if (rt->rt_gateway != old_gw)
 731                 return;
 732
 733         in_dev = __in_dev_get_rcu(dev);
 734         if (!in_dev)
 735                 return;
 736
 737         net = dev_net(dev);
 738         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 739             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 740             ipv4_is_zeronet(new_gw))
 741                 goto reject_redirect;
 742
 743         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 744                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 745                         goto reject_redirect;
 746                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 747                         goto reject_redirect;
 748         } else {
 749                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 750                         goto reject_redirect;
 751         }
 752
 753         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 754         if (!n)
 755                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 756         if (!IS_ERR(n)) {
 757                 if (!(n->nud_state & NUD_VALID)) {
 758                         neigh_event_send(n, NULL);
 759                 } else {
 760                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 761                                 struct fib_nh *nh = &FIB_RES_NH(res);
 762
 763                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 764                                                 0, jiffies + ip_rt_gc_timeout);
 765                         }
 766                         if (kill_route)
 767                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 768                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 769                 }
 770                 neigh_release(n);
 771         }
 772         return;
 773
 774 reject_redirect:
 775 #ifdef CONFIG_IP_ROUTE_VERBOSE
 776         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 777                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 778                 __be32 daddr = iph->daddr;
 779                 __be32 saddr = iph->saddr;
 780
 781                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 782                                      "  Advised path = %pI4 -> %pI4\n",
 783                                      &old_gw, dev->name, &new_gw,
 784                                      &saddr, &daddr);
 785         }
 786 #endif
 787         ;
 788 }
 789
 790 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 791 {
 792         struct rtable *rt;
 793         struct flowi4 fl4;
 794         const struct iphdr *iph = (const struct iphdr *) skb->data;
 795         int oif = skb->dev->ifindex;
 796         u8 tos = RT_TOS(iph->tos);
 797         u8 prot = iph->protocol;
 798         u32 mark = skb->mark;
 799
 800         rt = (struct rtable *) dst;
 801
 802         __build_flow_key(sock_net(sk), &fl4, sk, iph, oif, tos, prot, mark, 0);
 803         __ip_do_redirect(rt, skb, &fl4, true);
 804 }
 805
 806 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 807 {
 808         struct rtable *rt = (struct rtable *)dst;
 809         struct dst_entry *ret = dst;
 810
 811         if (rt) {
 812                 if (dst->obsolete > 0) {
 813                         ip_rt_put(rt);
 814                         ret = NULL;
 815                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 816                            rt->dst.expires) {
 817                         ip_rt_put(rt);
 818                         ret = NULL;
 819                 }
 820         }
 821         return ret;
 822 }
 823
 824 /*
 825  * Algorithm:
 826  *      1. The first ip_rt_redirect_number redirects are sent
 827  *         with exponential backoff, then we stop sending them at all,
 828  *         assuming that the host ignores our redirects.
 829  *      2. If we did not see packets requiring redirects
 830  *         during ip_rt_redirect_silence, we assume that the host
 831  *         forgot redirected route and start to send redirects again.
 832  *
 833  * This algorithm is much cheaper and more intelligent than dumb load limiting
 834  * in icmp.c.
 835  *
 836  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 837  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 838  */
 839
 840 void ip_rt_send_redirect(struct sk_buff *skb)
 841 {
 842         struct rtable *rt = skb_rtable(skb);
 843         struct in_device *in_dev;
 844         struct inet_peer *peer;
 845         struct net *net;
 846         int log_martians;
 847         int vif;
 848
 849         rcu_read_lock();
 850         in_dev = __in_dev_get_rcu(rt->dst.dev);
 851         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 852                 rcu_read_unlock();
 853                 return;
 854         }
 855         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 856         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 857         rcu_read_unlock();
 858
 859         net = dev_net(rt->dst.dev);
 860         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 861         if (!peer) {
 862                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 863                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 864                 return;
 865         }
 866
 867         /* No redirected packets during ip_rt_redirect_silence;
 868          * reset the algorithm.
 869          */
 870         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 871                 peer->rate_tokens = 0;
 872
 873         /* Too many ignored redirects; do not send anything
 874          * set dst.rate_last to the last seen redirected packet.
 875          */
 876         if (peer->rate_tokens >= ip_rt_redirect_number) {
 877                 peer->rate_last = jiffies;
 878                 goto out_put_peer;
 879         }
 880
 881         /* Check for load limit; set rate_last to the latest sent
 882          * redirect.
 883          */
 884         if (peer->rate_tokens == 0 ||
 885             time_after(jiffies,
 886                        (peer->rate_last +
 887                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 888                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 889
 890                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 891                 peer->rate_last = jiffies;
 892                 ++peer->rate_tokens;
 893 #ifdef CONFIG_IP_ROUTE_VERBOSE
 894                 if (log_martians &&
 895                     peer->rate_tokens == ip_rt_redirect_number)
 896                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 897                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 898                                              &ip_hdr(skb)->daddr, &gw);
 899 #endif
 900         }
 901 out_put_peer:
 902         inet_putpeer(peer);
 903 }
 904
 905 static int ip_error(struct sk_buff *skb)
 906 {
 907         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 908         struct rtable *rt = skb_rtable(skb);
 909         struct inet_peer *peer;
 910         unsigned long now;
 911         struct net *net;
 912         bool send;
 913         int code;
 914
 915         /* IP on this device is disabled. */
 916         if (!in_dev)
 917                 goto out;
 918
 919         net = dev_net(rt->dst.dev);
 920         if (!IN_DEV_FORWARD(in_dev)) {
 921                 switch (rt->dst.error) {
 922                 case EHOSTUNREACH:
 923                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
 924                         break;
 925
 926                 case ENETUNREACH:
 927                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 928                         break;
 929                 }
 930                 goto out;
 931         }
 932
 933         switch (rt->dst.error) {
 934         case EINVAL:
 935         default:
 936                 goto out;
 937         case EHOSTUNREACH:
 938                 code = ICMP_HOST_UNREACH;
 939                 break;
 940         case ENETUNREACH:
 941                 code = ICMP_NET_UNREACH;
 942                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 943                 break;
 944         case EACCES:
 945                 code = ICMP_PKT_FILTERED;
 946                 break;
 947         }
 948
 949         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 950                                l3mdev_master_ifindex(skb->dev), 1);
 951
 952         send = true;
 953         if (peer) {
 954                 now = jiffies;
 955                 peer->rate_tokens += now - peer->rate_last;
 956                 if (peer->rate_tokens > ip_rt_error_burst)
 957                         peer->rate_tokens = ip_rt_error_burst;
 958                 peer->rate_last = now;
 959                 if (peer->rate_tokens >= ip_rt_error_cost)
 960                         peer->rate_tokens -= ip_rt_error_cost;
 961                 else
 962                         send = false;
 963                 inet_putpeer(peer);
 964         }
 965         if (send)
 966                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 967
 968 out:    kfree_skb(skb);
 969         return 0;
 970 }
 971
 972 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 973 {
 974         struct dst_entry *dst = &rt->dst;
 975         struct fib_result res;
 976
 977         if (dst_metric_locked(dst, RTAX_MTU))
 978                 return;
 979
 980         if (ipv4_mtu(dst) < mtu)
 981                 return;
 982
 983         if (mtu < ip_rt_min_pmtu)
 984                 mtu = ip_rt_min_pmtu;
 985
 986         if (rt->rt_pmtu == mtu &&
 987             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
 988                 return;
 989
 990         rcu_read_lock();
 991         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
 992                 struct fib_nh *nh = &FIB_RES_NH(res);
 993
 994                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 995                                       jiffies + ip_rt_mtu_expires);
 996         }
 997         rcu_read_unlock();
 998 }
 999
1000 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1001                               struct sk_buff *skb, u32 mtu)
1002 {
1003         struct rtable *rt = (struct rtable *) dst;
1004         struct flowi4 fl4;
1005
1006         ip_rt_build_flow_key(&fl4, sk, skb);
1007         __ip_rt_update_pmtu(rt, &fl4, mtu);
1008 }
1009
1010 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1011                       int oif, u32 mark, u8 protocol, int flow_flags)
1012 {
1013         const struct iphdr *iph = (const struct iphdr *) skb->data;
1014         struct flowi4 fl4;
1015         struct rtable *rt;
1016
1017         if (!mark)
1018                 mark = IP4_REPLY_MARK(net, skb->mark);
1019
1020         __build_flow_key(net, &fl4, NULL, iph, oif,
1021                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1022         rt = __ip_route_output_key(net, &fl4);
1023         if (!IS_ERR(rt)) {
1024                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1025                 ip_rt_put(rt);
1026         }
1027 }
1028 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1029
1030 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1031 {
1032         const struct iphdr *iph = (const struct iphdr *) skb->data;
1033         struct flowi4 fl4;
1034         struct rtable *rt;
1035
1036         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1037
1038         if (!fl4.flowi4_mark)
1039                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1040
1041         rt = __ip_route_output_key(sock_net(sk), &fl4);
1042         if (!IS_ERR(rt)) {
1043                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1044                 ip_rt_put(rt);
1045         }
1046 }
1047
1048 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1049 {
1050         const struct iphdr *iph = (const struct iphdr *) skb->data;
1051         struct flowi4 fl4;
1052         struct rtable *rt;
1053         struct dst_entry *odst = NULL;
1054         bool new = false;
1055         struct net *net = sock_net(sk);
1056
1057         bh_lock_sock(sk);
1058
1059         if (!ip_sk_accept_pmtu(sk))
1060                 goto out;
1061
1062         odst = sk_dst_get(sk);
1063
1064         if (sock_owned_by_user(sk) || !odst) {
1065                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1066                 goto out;
1067         }
1068
1069         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1070
1071         rt = (struct rtable *)odst;
1072         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1073                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1074                 if (IS_ERR(rt))
1075                         goto out;
1076
1077                 new = true;
1078         }
1079
1080         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1081
1082         if (!dst_check(&rt->dst, 0)) {
1083                 if (new)
1084                         dst_release(&rt->dst);
1085
1086                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1087                 if (IS_ERR(rt))
1088                         goto out;
1089
1090                 new = true;
1091         }
1092
1093         if (new)
1094                 sk_dst_set(sk, &rt->dst);
1095
1096 out:
1097         bh_unlock_sock(sk);
1098         dst_release(odst);
1099 }
1100 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1101
1102 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1103                    int oif, u32 mark, u8 protocol, int flow_flags)
1104 {
1105         const struct iphdr *iph = (const struct iphdr *) skb->data;
1106         struct flowi4 fl4;
1107         struct rtable *rt;
1108
1109         __build_flow_key(net, &fl4, NULL, iph, oif,
1110                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1111         rt = __ip_route_output_key(net, &fl4);
1112         if (!IS_ERR(rt)) {
1113                 __ip_do_redirect(rt, skb, &fl4, false);
1114                 ip_rt_put(rt);
1115         }
1116 }
1117 EXPORT_SYMBOL_GPL(ipv4_redirect);
1118
1119 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1120 {
1121         const struct iphdr *iph = (const struct iphdr *) skb->data;
1122         struct flowi4 fl4;
1123         struct rtable *rt;
1124         struct net *net = sock_net(sk);
1125
1126         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1127         rt = __ip_route_output_key(net, &fl4);
1128         if (!IS_ERR(rt)) {
1129                 __ip_do_redirect(rt, skb, &fl4, false);
1130                 ip_rt_put(rt);
1131         }
1132 }
1133 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1134
1135 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1136 {
1137         struct rtable *rt = (struct rtable *) dst;
1138
1139         /* All IPV4 dsts are created with ->obsolete set to the value
1140          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1141          * into this function always.
1142          *
1143          * When a PMTU/redirect information update invalidates a route,
1144          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1145          * DST_OBSOLETE_DEAD by dst_free().
1146          */
1147         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1148                 return NULL;
1149         return dst;
1150 }
1151
1152 static void ipv4_link_failure(struct sk_buff *skb)
1153 {
1154         struct rtable *rt;
1155
1156         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1157
1158         rt = skb_rtable(skb);
1159         if (rt)
1160                 dst_set_expires(&rt->dst, 0);
1161 }
1162
1163 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1164 {
1165         pr_debug("%s: %pI4 -> %pI4, %s\n",
1166                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1167                  skb->dev ? skb->dev->name : "?");
1168         kfree_skb(skb);
1169         WARN_ON(1);
1170         return 0;
1171 }
1172
1173 /*
1174    We do not cache source address of outgoing interface,
1175    because it is used only by IP RR, TS and SRR options,
1176    so that it out of fast path.
1177
1178    BTW remember: "addr" is allowed to be not aligned
1179    in IP options!
1180  */
1181
1182 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1183 {
1184         __be32 src;
1185
1186         if (rt_is_output_route(rt))
1187                 src = ip_hdr(skb)->saddr;
1188         else {
1189                 struct fib_result res;
1190                 struct flowi4 fl4;
1191                 struct iphdr *iph;
1192
1193                 iph = ip_hdr(skb);
1194
1195                 memset(&fl4, 0, sizeof(fl4));
1196                 fl4.daddr = iph->daddr;
1197                 fl4.saddr = iph->saddr;
1198                 fl4.flowi4_tos = RT_TOS(iph->tos);
1199                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1200                 fl4.flowi4_iif = skb->dev->ifindex;
1201                 fl4.flowi4_mark = skb->mark;
1202
1203                 rcu_read_lock();
1204                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1205                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1206                 else
1207                         src = inet_select_addr(rt->dst.dev,
1208                                                rt_nexthop(rt, iph->daddr),
1209                                                RT_SCOPE_UNIVERSE);
1210                 rcu_read_unlock();
1211         }
1212         memcpy(addr, &src, 4);
1213 }
1214
1215 #ifdef CONFIG_IP_ROUTE_CLASSID
1216 static void set_class_tag(struct rtable *rt, u32 tag)
1217 {
1218         if (!(rt->dst.tclassid & 0xFFFF))
1219                 rt->dst.tclassid |= tag & 0xFFFF;
1220         if (!(rt->dst.tclassid & 0xFFFF0000))
1221                 rt->dst.tclassid |= tag & 0xFFFF0000;
1222 }
1223 #endif
1224
1225 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1226 {
1227         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1228
1229         if (advmss == 0) {
1230                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1231                                ip_rt_min_advmss);
1232                 if (advmss > 65535 - 40)
1233                         advmss = 65535 - 40;
1234         }
1235         return advmss;
1236 }
1237
1238 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1239 {
1240         const struct rtable *rt = (const struct rtable *) dst;
1241         unsigned int mtu = rt->rt_pmtu;
1242
1243         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1244                 mtu = dst_metric_raw(dst, RTAX_MTU);
1245
1246         if (mtu)
1247                 return mtu;
1248
1249         mtu = dst->dev->mtu;
1250
1251         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1252                 if (rt->rt_uses_gateway && mtu > 576)
1253                         mtu = 576;
1254         }
1255
1256         return min_t(unsigned int, mtu, IP_MAX_MTU);
1257 }
1258
1259 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1260 {
1261         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1262         struct fib_nh_exception *fnhe;
1263         u32 hval;
1264
1265         if (!hash)
1266                 return NULL;
1267
1268         hval = fnhe_hashfun(daddr);
1269
1270         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1271              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1272                 if (fnhe->fnhe_daddr == daddr)
1273                         return fnhe;
1274         }
1275         return NULL;
1276 }
1277
1278 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1279                               __be32 daddr)
1280 {
1281         bool ret = false;
1282
1283         spin_lock_bh(&fnhe_lock);
1284
1285         if (daddr == fnhe->fnhe_daddr) {
1286                 struct rtable __rcu **porig;
1287                 struct rtable *orig;
1288                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1289
1290                 if (rt_is_input_route(rt))
1291                         porig = &fnhe->fnhe_rth_input;
1292                 else
1293                         porig = &fnhe->fnhe_rth_output;
1294                 orig = rcu_dereference(*porig);
1295
1296                 if (fnhe->fnhe_genid != genid) {
1297                         fnhe->fnhe_genid = genid;
1298                         fnhe->fnhe_gw = 0;
1299                         fnhe->fnhe_pmtu = 0;
1300                         fnhe->fnhe_expires = 0;
1301                         fnhe_flush_routes(fnhe);
1302                         orig = NULL;
1303                 }
1304                 fill_route_from_fnhe(rt, fnhe);
1305                 if (!rt->rt_gateway)
1306                         rt->rt_gateway = daddr;
1307
1308                 if (!(rt->dst.flags & DST_NOCACHE)) {
1309                         rcu_assign_pointer(*porig, rt);
1310                         if (orig)
1311                                 rt_free(orig);
1312                         ret = true;
1313                 }
1314
1315                 fnhe->fnhe_stamp = jiffies;
1316         }
1317         spin_unlock_bh(&fnhe_lock);
1318
1319         return ret;
1320 }
1321
1322 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1323 {
1324         struct rtable *orig, *prev, **p;
1325         bool ret = true;
1326
1327         if (rt_is_input_route(rt)) {
1328                 p = (struct rtable **)&nh->nh_rth_input;
1329         } else {
1330                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1331         }
1332         orig = *p;
1333
1334         prev = cmpxchg(p, orig, rt);
1335         if (prev == orig) {
1336                 if (orig)
1337                         rt_free(orig);
1338         } else
1339                 ret = false;
1340
1341         return ret;
1342 }
1343
1344 struct uncached_list {
1345         spinlock_t              lock;
1346         struct list_head        head;
1347 };
1348
1349 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1350
1351 static void rt_add_uncached_list(struct rtable *rt)
1352 {
1353         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1354
1355         rt->rt_uncached_list = ul;
1356
1357         spin_lock_bh(&ul->lock);
1358         list_add_tail(&rt->rt_uncached, &ul->head);
1359         spin_unlock_bh(&ul->lock);
1360 }
1361
1362 static void ipv4_dst_destroy(struct dst_entry *dst)
1363 {
1364         struct rtable *rt = (struct rtable *) dst;
1365
1366         if (!list_empty(&rt->rt_uncached)) {
1367                 struct uncached_list *ul = rt->rt_uncached_list;
1368
1369                 spin_lock_bh(&ul->lock);
1370                 list_del(&rt->rt_uncached);
1371                 spin_unlock_bh(&ul->lock);
1372         }
1373 }
1374
1375 void rt_flush_dev(struct net_device *dev)
1376 {
1377         struct net *net = dev_net(dev);
1378         struct rtable *rt;
1379         int cpu;
1380
1381         for_each_possible_cpu(cpu) {
1382                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1383
1384                 spin_lock_bh(&ul->lock);
1385                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1386                         if (rt->dst.dev != dev)
1387                                 continue;
1388                         rt->dst.dev = net->loopback_dev;
1389                         dev_hold(rt->dst.dev);
1390                         dev_put(dev);
1391                 }
1392                 spin_unlock_bh(&ul->lock);
1393         }
1394 }
1395
1396 static bool rt_cache_valid(const struct rtable *rt)
1397 {
1398         return  rt &&
1399                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1400                 !rt_is_expired(rt);
1401 }
1402
1403 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1404                            const struct fib_result *res,
1405                            struct fib_nh_exception *fnhe,
1406                            struct fib_info *fi, u16 type, u32 itag)
1407 {
1408         bool cached = false;
1409
1410         if (fi) {
1411                 struct fib_nh *nh = &FIB_RES_NH(*res);
1412
1413                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1414                         rt->rt_gateway = nh->nh_gw;
1415                         rt->rt_uses_gateway = 1;
1416                 }
1417                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1418 #ifdef CONFIG_IP_ROUTE_CLASSID
1419                 rt->dst.tclassid = nh->nh_tclassid;
1420 #endif
1421                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1422                 if (unlikely(fnhe))
1423                         cached = rt_bind_exception(rt, fnhe, daddr);
1424                 else if (!(rt->dst.flags & DST_NOCACHE))
1425                         cached = rt_cache_route(nh, rt);
1426                 if (unlikely(!cached)) {
1427                         /* Routes we intend to cache in nexthop exception or
1428                          * FIB nexthop have the DST_NOCACHE bit clear.
1429                          * However, if we are unsuccessful at storing this
1430                          * route into the cache we really need to set it.
1431                          */
1432                         rt->dst.flags |= DST_NOCACHE;
1433                         if (!rt->rt_gateway)
1434                                 rt->rt_gateway = daddr;
1435                         rt_add_uncached_list(rt);
1436                 }
1437         } else
1438                 rt_add_uncached_list(rt);
1439
1440 #ifdef CONFIG_IP_ROUTE_CLASSID
1441 #ifdef CONFIG_IP_MULTIPLE_TABLES
1442         set_class_tag(rt, res->tclassid);
1443 #endif
1444         set_class_tag(rt, itag);
1445 #endif
1446 }
1447
1448 static struct rtable *rt_dst_alloc(struct net_device *dev,
1449                                    unsigned int flags, u16 type,
1450                                    bool nopolicy, bool noxfrm, bool will_cache)
1451 {
1452         struct rtable *rt;
1453
1454         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1455                        (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1456                        (nopolicy ? DST_NOPOLICY : 0) |
1457                        (noxfrm ? DST_NOXFRM : 0));
1458
1459         if (rt) {
1460                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1461                 rt->rt_flags = flags;
1462                 rt->rt_type = type;
1463                 rt->rt_is_input = 0;
1464                 rt->rt_iif = 0;
1465                 rt->rt_pmtu = 0;
1466                 rt->rt_gateway = 0;
1467                 rt->rt_uses_gateway = 0;
1468                 rt->rt_table_id = 0;
1469                 INIT_LIST_HEAD(&rt->rt_uncached);
1470
1471                 rt->dst.output = ip_output;
1472                 if (flags & RTCF_LOCAL)
1473                         rt->dst.input = ip_local_deliver;
1474         }
1475
1476         return rt;
1477 }
1478
1479 /* called in rcu_read_lock() section */
1480 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1481                                 u8 tos, struct net_device *dev, int our)
1482 {
1483         struct rtable *rth;
1484         struct in_device *in_dev = __in_dev_get_rcu(dev);
1485         unsigned int flags = RTCF_MULTICAST;
1486         u32 itag = 0;
1487         int err;
1488
1489         /* Primary sanity checks. */
1490
1491         if (!in_dev)
1492                 return -EINVAL;
1493
1494         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1495             skb->protocol != htons(ETH_P_IP))
1496                 goto e_inval;
1497
1498         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1499                 goto e_inval;
1500
1501         if (ipv4_is_zeronet(saddr)) {
1502                 if (!ipv4_is_local_multicast(daddr))
1503                         goto e_inval;
1504         } else {
1505                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1506                                           in_dev, &itag);
1507                 if (err < 0)
1508                         goto e_err;
1509         }
1510         if (our)
1511                 flags |= RTCF_LOCAL;
1512
1513         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1514                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1515         if (!rth)
1516                 goto e_nobufs;
1517
1518 #ifdef CONFIG_IP_ROUTE_CLASSID
1519         rth->dst.tclassid = itag;
1520 #endif
1521         rth->dst.output = ip_rt_bug;
1522         rth->rt_is_input= 1;
1523
1524 #ifdef CONFIG_IP_MROUTE
1525         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1526                 rth->dst.input = ip_mr_input;
1527 #endif
1528         RT_CACHE_STAT_INC(in_slow_mc);
1529
1530         skb_dst_set(skb, &rth->dst);
1531         return 0;
1532
1533 e_nobufs:
1534         return -ENOBUFS;
1535 e_inval:
1536         return -EINVAL;
1537 e_err:
1538         return err;
1539 }
1540
1541
1542 static void ip_handle_martian_source(struct net_device *dev,
1543                                      struct in_device *in_dev,
1544                                      struct sk_buff *skb,
1545                                      __be32 daddr,
1546                                      __be32 saddr)
1547 {
1548         RT_CACHE_STAT_INC(in_martian_src);
1549 #ifdef CONFIG_IP_ROUTE_VERBOSE
1550         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1551                 /*
1552                  *      RFC1812 recommendation, if source is martian,
1553                  *      the only hint is MAC header.
1554                  */
1555                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1556                         &daddr, &saddr, dev->name);
1557                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1558                         print_hex_dump(KERN_WARNING, "ll header: ",
1559                                        DUMP_PREFIX_OFFSET, 16, 1,
1560                                        skb_mac_header(skb),
1561                                        dev->hard_header_len, true);
1562                 }
1563         }
1564 #endif
1565 }
1566
1567 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1568 {
1569         struct fnhe_hash_bucket *hash;
1570         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1571         u32 hval = fnhe_hashfun(daddr);
1572
1573         spin_lock_bh(&fnhe_lock);
1574
1575         hash = rcu_dereference_protected(nh->nh_exceptions,
1576                                          lockdep_is_held(&fnhe_lock));
1577         hash += hval;
1578
1579         fnhe_p = &hash->chain;
1580         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1581         while (fnhe) {
1582                 if (fnhe->fnhe_daddr == daddr) {
1583                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1584                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1585                         fnhe_flush_routes(fnhe);
1586                         kfree_rcu(fnhe, rcu);
1587                         break;
1588                 }
1589                 fnhe_p = &fnhe->fnhe_next;
1590                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1591                                                  lockdep_is_held(&fnhe_lock));
1592         }
1593
1594         spin_unlock_bh(&fnhe_lock);
1595 }
1596
1597 /* called in rcu_read_lock() section */
1598 static int __mkroute_input(struct sk_buff *skb,
1599                            const struct fib_result *res,
1600                            struct in_device *in_dev,
1601                            __be32 daddr, __be32 saddr, u32 tos)
1602 {
1603         struct fib_nh_exception *fnhe;
1604         struct rtable *rth;
1605         int err;
1606         struct in_device *out_dev;
1607         bool do_cache;
1608         u32 itag = 0;
1609
1610         /* get a working reference to the output device */
1611         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1612         if (!out_dev) {
1613                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1614                 return -EINVAL;
1615         }
1616
1617         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1618                                   in_dev->dev, in_dev, &itag);
1619         if (err < 0) {
1620                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1621                                          saddr);
1622
1623                 goto cleanup;
1624         }
1625
1626         do_cache = res->fi && !itag;
1627         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1628             skb->protocol == htons(ETH_P_IP) &&
1629             (IN_DEV_SHARED_MEDIA(out_dev) ||
1630              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1631                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1632
1633         if (skb->protocol != htons(ETH_P_IP)) {
1634                 /* Not IP (i.e. ARP). Do not create route, if it is
1635                  * invalid for proxy arp. DNAT routes are always valid.
1636                  *
1637                  * Proxy arp feature have been extended to allow, ARP
1638                  * replies back to the same interface, to support
1639                  * Private VLAN switch technologies. See arp.c.
1640                  */
1641                 if (out_dev == in_dev &&
1642                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1643                         err = -EINVAL;
1644                         goto cleanup;
1645                 }
1646         }
1647
1648         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1649         if (do_cache) {
1650                 if (fnhe) {
1651                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1652                         if (rth && rth->dst.expires &&
1653                             time_after(jiffies, rth->dst.expires)) {
1654                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1655                                 fnhe = NULL;
1656                         } else {
1657                                 goto rt_cache;
1658                         }
1659                 }
1660
1661                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1662
1663 rt_cache:
1664                 if (rt_cache_valid(rth)) {
1665                         skb_dst_set_noref(skb, &rth->dst);
1666                         goto out;
1667                 }
1668         }
1669
1670         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1671                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1672                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1673         if (!rth) {
1674                 err = -ENOBUFS;
1675                 goto cleanup;
1676         }
1677
1678         rth->rt_is_input = 1;
1679         if (res->table)
1680                 rth->rt_table_id = res->table->tb_id;
1681         RT_CACHE_STAT_INC(in_slow_tot);
1682
1683         rth->dst.input = ip_forward;
1684
1685         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1686         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1687                 rth->dst.lwtstate->orig_output = rth->dst.output;
1688                 rth->dst.output = lwtunnel_output;
1689         }
1690         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1691                 rth->dst.lwtstate->orig_input = rth->dst.input;
1692                 rth->dst.input = lwtunnel_input;
1693         }
1694         skb_dst_set(skb, &rth->dst);
1695 out:
1696         err = 0;
1697  cleanup:
1698         return err;
1699 }
1700
1701 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1702
1703 /* To make ICMP packets follow the right flow, the multipath hash is
1704  * calculated from the inner IP addresses in reverse order.
1705  */
1706 static int ip_multipath_icmp_hash(struct sk_buff *skb)
1707 {
1708         const struct iphdr *outer_iph = ip_hdr(skb);
1709         struct icmphdr _icmph;
1710         const struct icmphdr *icmph;
1711         struct iphdr _inner_iph;
1712         const struct iphdr *inner_iph;
1713
1714         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1715                 goto standard_hash;
1716
1717         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1718                                    &_icmph);
1719         if (!icmph)
1720                 goto standard_hash;
1721
1722         if (icmph->type != ICMP_DEST_UNREACH &&
1723             icmph->type != ICMP_REDIRECT &&
1724             icmph->type != ICMP_TIME_EXCEEDED &&
1725             icmph->type != ICMP_PARAMETERPROB) {
1726                 goto standard_hash;
1727         }
1728
1729         inner_iph = skb_header_pointer(skb,
1730                                        outer_iph->ihl * 4 + sizeof(_icmph),
1731                                        sizeof(_inner_iph), &_inner_iph);
1732         if (!inner_iph)
1733                 goto standard_hash;
1734
1735         return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1736
1737 standard_hash:
1738         return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1739 }
1740
1741 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1742
1743 static int ip_mkroute_input(struct sk_buff *skb,
1744                             struct fib_result *res,
1745                             const struct flowi4 *fl4,
1746                             struct in_device *in_dev,
1747                             __be32 daddr, __be32 saddr, u32 tos)
1748 {
1749 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1750         if (res->fi && res->fi->fib_nhs > 1) {
1751                 int h;
1752
1753                 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1754                         h = ip_multipath_icmp_hash(skb);
1755                 else
1756                         h = fib_multipath_hash(saddr, daddr);
1757                 fib_select_multipath(res, h);
1758         }
1759 #endif
1760
1761         /* create a routing cache entry */
1762         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1763 }
1764
1765 /*
1766  *      NOTE. We drop all the packets that has local source
1767  *      addresses, because every properly looped back packet
1768  *      must have correct destination already attached by output routine.
1769  *
1770  *      Such approach solves two big problems:
1771  *      1. Not simplex devices are handled properly.
1772  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1773  *      called with rcu_read_lock()
1774  */
1775
1776 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1777                                u8 tos, struct net_device *dev)
1778 {
1779         struct fib_result res;
1780         struct in_device *in_dev = __in_dev_get_rcu(dev);
1781         struct ip_tunnel_info *tun_info;
1782         struct flowi4   fl4;
1783         unsigned int    flags = 0;
1784         u32             itag = 0;
1785         struct rtable   *rth;
1786         int             err = -EINVAL;
1787         struct net    *net = dev_net(dev);
1788         bool do_cache;
1789
1790         /* IP on this device is disabled. */
1791
1792         if (!in_dev)
1793                 goto out;
1794
1795         /* Check for the most weird martians, which can be not detected
1796            by fib_lookup.
1797          */
1798
1799         tun_info = skb_tunnel_info(skb);
1800         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1801                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1802         else
1803                 fl4.flowi4_tun_key.tun_id = 0;
1804         skb_dst_drop(skb);
1805
1806         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1807                 goto martian_source;
1808
1809         res.fi = NULL;
1810         res.table = NULL;
1811         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1812                 goto brd_input;
1813
1814         /* Accept zero addresses only to limited broadcast;
1815          * I even do not know to fix it or not. Waiting for complains :-)
1816          */
1817         if (ipv4_is_zeronet(saddr))
1818                 goto martian_source;
1819
1820         if (ipv4_is_zeronet(daddr))
1821                 goto martian_destination;
1822
1823         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1824          * and call it once if daddr or/and saddr are loopback addresses
1825          */
1826         if (ipv4_is_loopback(daddr)) {
1827                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1828                         goto martian_destination;
1829         } else if (ipv4_is_loopback(saddr)) {
1830                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1831                         goto martian_source;
1832         }
1833
1834         /*
1835          *      Now we are ready to route packet.
1836          */
1837         fl4.flowi4_oif = 0;
1838         fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
1839         fl4.flowi4_mark = skb->mark;
1840         fl4.flowi4_tos = tos;
1841         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1842         fl4.flowi4_flags = 0;
1843         fl4.daddr = daddr;
1844         fl4.saddr = saddr;
1845         err = fib_lookup(net, &fl4, &res, 0);
1846         if (err != 0) {
1847                 if (!IN_DEV_FORWARD(in_dev))
1848                         err = -EHOSTUNREACH;
1849                 goto no_route;
1850         }
1851
1852         if (res.type == RTN_BROADCAST)
1853                 goto brd_input;
1854
1855         if (res.type == RTN_LOCAL) {
1856                 err = fib_validate_source(skb, saddr, daddr, tos,
1857                                           0, dev, in_dev, &itag);
1858                 if (err < 0)
1859                         goto martian_source;
1860                 goto local_input;
1861         }
1862
1863         if (!IN_DEV_FORWARD(in_dev)) {
1864                 err = -EHOSTUNREACH;
1865                 goto no_route;
1866         }
1867         if (res.type != RTN_UNICAST)
1868                 goto martian_destination;
1869
1870         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1871 out:    return err;
1872
1873 brd_input:
1874         if (skb->protocol != htons(ETH_P_IP))
1875                 goto e_inval;
1876
1877         if (!ipv4_is_zeronet(saddr)) {
1878                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1879                                           in_dev, &itag);
1880                 if (err < 0)
1881                         goto martian_source;
1882         }
1883         flags |= RTCF_BROADCAST;
1884         res.type = RTN_BROADCAST;
1885         RT_CACHE_STAT_INC(in_brd);
1886
1887 local_input:
1888         do_cache = false;
1889         if (res.fi) {
1890                 if (!itag) {
1891                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1892                         if (rt_cache_valid(rth)) {
1893                                 skb_dst_set_noref(skb, &rth->dst);
1894                                 err = 0;
1895                                 goto out;
1896                         }
1897                         do_cache = true;
1898                 }
1899         }
1900
1901         rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
1902                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1903         if (!rth)
1904                 goto e_nobufs;
1905
1906         rth->dst.output= ip_rt_bug;
1907 #ifdef CONFIG_IP_ROUTE_CLASSID
1908         rth->dst.tclassid = itag;
1909 #endif
1910         rth->rt_is_input = 1;
1911         if (res.table)
1912                 rth->rt_table_id = res.table->tb_id;
1913
1914         RT_CACHE_STAT_INC(in_slow_tot);
1915         if (res.type == RTN_UNREACHABLE) {
1916                 rth->dst.input= ip_error;
1917                 rth->dst.error= -err;
1918                 rth->rt_flags   &= ~RTCF_LOCAL;
1919         }
1920         if (do_cache) {
1921                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1922                         rth->dst.flags |= DST_NOCACHE;
1923                         rt_add_uncached_list(rth);
1924                 }
1925         }
1926         skb_dst_set(skb, &rth->dst);
1927         err = 0;
1928         goto out;
1929
1930 no_route:
1931         RT_CACHE_STAT_INC(in_no_route);
1932         res.type = RTN_UNREACHABLE;
1933         res.fi = NULL;
1934         res.table = NULL;
1935         goto local_input;
1936
1937         /*
1938          *      Do not cache martian addresses: they should be logged (RFC1812)
1939          */
1940 martian_destination:
1941         RT_CACHE_STAT_INC(in_martian_dst);
1942 #ifdef CONFIG_IP_ROUTE_VERBOSE
1943         if (IN_DEV_LOG_MARTIANS(in_dev))
1944                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1945                                      &daddr, &saddr, dev->name);
1946 #endif
1947
1948 e_inval:
1949         err = -EINVAL;
1950         goto out;
1951
1952 e_nobufs:
1953         err = -ENOBUFS;
1954         goto out;
1955
1956 martian_source:
1957         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1958         goto out;
1959 }
1960
1961 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1962                          u8 tos, struct net_device *dev)
1963 {
1964         int res;
1965
1966         rcu_read_lock();
1967
1968         /* Multicast recognition logic is moved from route cache to here.
1969            The problem was that too many Ethernet cards have broken/missing
1970            hardware multicast filters :-( As result the host on multicasting
1971            network acquires a lot of useless route cache entries, sort of
1972            SDR messages from all the world. Now we try to get rid of them.
1973            Really, provided software IP multicast filter is organized
1974            reasonably (at least, hashed), it does not result in a slowdown
1975            comparing with route cache reject entries.
1976            Note, that multicast routers are not affected, because
1977            route cache entry is created eventually.
1978          */
1979         if (ipv4_is_multicast(daddr)) {
1980                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1981
1982                 if (in_dev) {
1983                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1984                                                   ip_hdr(skb)->protocol);
1985                         if (our
1986 #ifdef CONFIG_IP_MROUTE
1987                                 ||
1988                             (!ipv4_is_local_multicast(daddr) &&
1989                              IN_DEV_MFORWARD(in_dev))
1990 #endif
1991                            ) {
1992                                 int res = ip_route_input_mc(skb, daddr, saddr,
1993                                                             tos, dev, our);
1994                                 rcu_read_unlock();
1995                                 return res;
1996                         }
1997                 }
1998                 rcu_read_unlock();
1999                 return -EINVAL;
2000         }
2001         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2002         rcu_read_unlock();
2003         return res;
2004 }
2005 EXPORT_SYMBOL(ip_route_input_noref);
2006
2007 /* called with rcu_read_lock() */
2008 static struct rtable *__mkroute_output(const struct fib_result *res,
2009                                        const struct flowi4 *fl4, int orig_oif,
2010                                        struct net_device *dev_out,
2011                                        unsigned int flags)
2012 {
2013         struct fib_info *fi = res->fi;
2014         struct fib_nh_exception *fnhe;
2015         struct in_device *in_dev;
2016         u16 type = res->type;
2017         struct rtable *rth;
2018         bool do_cache;
2019
2020         in_dev = __in_dev_get_rcu(dev_out);
2021         if (!in_dev)
2022                 return ERR_PTR(-EINVAL);
2023
2024         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2025                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2026                         return ERR_PTR(-EINVAL);
2027
2028         if (ipv4_is_lbcast(fl4->daddr))
2029                 type = RTN_BROADCAST;
2030         else if (ipv4_is_multicast(fl4->daddr))
2031                 type = RTN_MULTICAST;
2032         else if (ipv4_is_zeronet(fl4->daddr))
2033                 return ERR_PTR(-EINVAL);
2034
2035         if (dev_out->flags & IFF_LOOPBACK)
2036                 flags |= RTCF_LOCAL;
2037
2038         do_cache = true;
2039         if (type == RTN_BROADCAST) {
2040                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2041                 fi = NULL;
2042         } else if (type == RTN_MULTICAST) {
2043                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2044                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2045                                      fl4->flowi4_proto))
2046                         flags &= ~RTCF_LOCAL;
2047                 else
2048                         do_cache = false;
2049                 /* If multicast route do not exist use
2050                  * default one, but do not gateway in this case.
2051                  * Yes, it is hack.
2052                  */
2053                 if (fi && res->prefixlen < 4)
2054                         fi = NULL;
2055         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2056                    (orig_oif != dev_out->ifindex)) {
2057                 /* For local routes that require a particular output interface
2058                  * we do not want to cache the result.  Caching the result
2059                  * causes incorrect behaviour when there are multiple source
2060                  * addresses on the interface, the end result being that if the
2061                  * intended recipient is waiting on that interface for the
2062                  * packet he won't receive it because it will be delivered on
2063                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2064                  * be set to the loopback interface as well.
2065                  */
2066                 fi = NULL;
2067         }
2068
2069         fnhe = NULL;
2070         do_cache &= fi != NULL;
2071         if (do_cache) {
2072                 struct rtable __rcu **prth;
2073                 struct fib_nh *nh = &FIB_RES_NH(*res);
2074
2075                 fnhe = find_exception(nh, fl4->daddr);
2076                 if (fnhe) {
2077                         prth = &fnhe->fnhe_rth_output;
2078                         rth = rcu_dereference(*prth);
2079                         if (rth && rth->dst.expires &&
2080                             time_after(jiffies, rth->dst.expires)) {
2081                                 ip_del_fnhe(nh, fl4->daddr);
2082                                 fnhe = NULL;
2083                         } else {
2084                                 goto rt_cache;
2085                         }
2086                 }
2087
2088                 if (unlikely(fl4->flowi4_flags &
2089                              FLOWI_FLAG_KNOWN_NH &&
2090                              !(nh->nh_gw &&
2091                                nh->nh_scope == RT_SCOPE_LINK))) {
2092                         do_cache = false;
2093                         goto add;
2094                 }
2095                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2096                 rth = rcu_dereference(*prth);
2097
2098 rt_cache:
2099                 if (rt_cache_valid(rth)) {
2100                         dst_hold(&rth->dst);
2101                         return rth;
2102                 }
2103         }
2104
2105 add:
2106         rth = rt_dst_alloc(dev_out, flags, type,
2107                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2108                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2109                            do_cache);
2110         if (!rth)
2111                 return ERR_PTR(-ENOBUFS);
2112
2113         rth->rt_iif     = orig_oif ? : 0;
2114         if (res->table)
2115                 rth->rt_table_id = res->table->tb_id;
2116
2117         RT_CACHE_STAT_INC(out_slow_tot);
2118
2119         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2120                 if (flags & RTCF_LOCAL &&
2121                     !(dev_out->flags & IFF_LOOPBACK)) {
2122                         rth->dst.output = ip_mc_output;
2123                         RT_CACHE_STAT_INC(out_slow_mc);
2124                 }
2125 #ifdef CONFIG_IP_MROUTE
2126                 if (type == RTN_MULTICAST) {
2127                         if (IN_DEV_MFORWARD(in_dev) &&
2128                             !ipv4_is_local_multicast(fl4->daddr)) {
2129                                 rth->dst.input = ip_mr_input;
2130                                 rth->dst.output = ip_mc_output;
2131                         }
2132                 }
2133 #endif
2134         }
2135
2136         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2137         if (lwtunnel_output_redirect(rth->dst.lwtstate))
2138                 rth->dst.output = lwtunnel_output;
2139
2140         return rth;
2141 }
2142
2143 /*
2144  * Major route resolver routine.
2145  */
2146
2147 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2148                                           int mp_hash)
2149 {
2150         struct net_device *dev_out = NULL;
2151         __u8 tos = RT_FL_TOS(fl4);
2152         unsigned int flags = 0;
2153         struct fib_result res;
2154         struct rtable *rth;
2155         int orig_oif;
2156         int err = -ENETUNREACH;
2157
2158         res.tclassid    = 0;
2159         res.fi          = NULL;
2160         res.table       = NULL;
2161
2162         orig_oif = fl4->flowi4_oif;
2163
2164         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2165         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2166         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2167                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2168
2169         rcu_read_lock();
2170         if (fl4->saddr) {
2171                 rth = ERR_PTR(-EINVAL);
2172                 if (ipv4_is_multicast(fl4->saddr) ||
2173                     ipv4_is_lbcast(fl4->saddr) ||
2174                     ipv4_is_zeronet(fl4->saddr))
2175                         goto out;
2176
2177                 /* I removed check for oif == dev_out->oif here.
2178                    It was wrong for two reasons:
2179                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2180                       is assigned to multiple interfaces.
2181                    2. Moreover, we are allowed to send packets with saddr
2182                       of another iface. --ANK
2183                  */
2184
2185                 if (fl4->flowi4_oif == 0 &&
2186                     (ipv4_is_multicast(fl4->daddr) ||
2187                      ipv4_is_lbcast(fl4->daddr))) {
2188                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2189                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2190                         if (!dev_out)
2191                                 goto out;
2192
2193                         /* Special hack: user can direct multicasts
2194                            and limited broadcast via necessary interface
2195                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2196                            This hack is not just for fun, it allows
2197                            vic,vat and friends to work.
2198                            They bind socket to loopback, set ttl to zero
2199                            and expect that it will work.
2200                            From the viewpoint of routing cache they are broken,
2201                            because we are not allowed to build multicast path
2202                            with loopback source addr (look, routing cache
2203                            cannot know, that ttl is zero, so that packet
2204                            will not leave this host and route is valid).
2205                            Luckily, this hack is good workaround.
2206                          */
2207
2208                         fl4->flowi4_oif = dev_out->ifindex;
2209                         goto make_route;
2210                 }
2211
2212                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2213                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2214                         if (!__ip_dev_find(net, fl4->saddr, false))
2215                                 goto out;
2216                 }
2217         }
2218
2219
2220         if (fl4->flowi4_oif) {
2221                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2222                 rth = ERR_PTR(-ENODEV);
2223                 if (!dev_out)
2224                         goto out;
2225
2226                 /* RACE: Check return value of inet_select_addr instead. */
2227                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2228                         rth = ERR_PTR(-ENETUNREACH);
2229                         goto out;
2230                 }
2231                 if (ipv4_is_local_multicast(fl4->daddr) ||
2232                     ipv4_is_lbcast(fl4->daddr) ||
2233                     fl4->flowi4_proto == IPPROTO_IGMP) {
2234                         if (!fl4->saddr)
2235                                 fl4->saddr = inet_select_addr(dev_out, 0,
2236                                                               RT_SCOPE_LINK);
2237                         goto make_route;
2238                 }
2239                 if (!fl4->saddr) {
2240                         if (ipv4_is_multicast(fl4->daddr))
2241                                 fl4->saddr = inet_select_addr(dev_out, 0,
2242                                                               fl4->flowi4_scope);
2243                         else if (!fl4->daddr)
2244                                 fl4->saddr = inet_select_addr(dev_out, 0,
2245                                                               RT_SCOPE_HOST);
2246                 }
2247
2248                 rth = l3mdev_get_rtable(dev_out, fl4);
2249                 if (rth)
2250                         goto out;
2251         }
2252
2253         if (!fl4->daddr) {
2254                 fl4->daddr = fl4->saddr;
2255                 if (!fl4->daddr)
2256                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2257                 dev_out = net->loopback_dev;
2258                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2259                 res.type = RTN_LOCAL;
2260                 flags |= RTCF_LOCAL;
2261                 goto make_route;
2262         }
2263
2264         err = fib_lookup(net, fl4, &res, 0);
2265         if (err) {
2266                 res.fi = NULL;
2267                 res.table = NULL;
2268                 if (fl4->flowi4_oif &&
2269                     !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2270                         /* Apparently, routing tables are wrong. Assume,
2271                            that the destination is on link.
2272
2273                            WHY? DW.
2274                            Because we are allowed to send to iface
2275                            even if it has NO routes and NO assigned
2276                            addresses. When oif is specified, routing
2277                            tables are looked up with only one purpose:
2278                            to catch if destination is gatewayed, rather than
2279                            direct. Moreover, if MSG_DONTROUTE is set,
2280                            we send packet, ignoring both routing tables
2281                            and ifaddr state. --ANK
2282
2283
2284                            We could make it even if oif is unknown,
2285                            likely IPv6, but we do not.
2286                          */
2287
2288                         if (fl4->saddr == 0)
2289                                 fl4->saddr = inet_select_addr(dev_out, 0,
2290                                                               RT_SCOPE_LINK);
2291                         res.type = RTN_UNICAST;
2292                         goto make_route;
2293                 }
2294                 rth = ERR_PTR(err);
2295                 goto out;
2296         }
2297
2298         if (res.type == RTN_LOCAL) {
2299                 if (!fl4->saddr) {
2300                         if (res.fi->fib_prefsrc)
2301                                 fl4->saddr = res.fi->fib_prefsrc;
2302                         else
2303                                 fl4->saddr = fl4->daddr;
2304                 }
2305                 dev_out = net->loopback_dev;
2306                 fl4->flowi4_oif = dev_out->ifindex;
2307                 flags |= RTCF_LOCAL;
2308                 goto make_route;
2309         }
2310
2311         fib_select_path(net, &res, fl4, mp_hash);
2312
2313         dev_out = FIB_RES_DEV(res);
2314         fl4->flowi4_oif = dev_out->ifindex;
2315
2316
2317 make_route:
2318         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2319
2320 out:
2321         rcu_read_unlock();
2322         return rth;
2323 }
2324 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2325
2326 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2327 {
2328         return NULL;
2329 }
2330
2331 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2332 {
2333         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2334
2335         return mtu ? : dst->dev->mtu;
2336 }
2337
2338 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2339                                           struct sk_buff *skb, u32 mtu)
2340 {
2341 }
2342
2343 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2344                                        struct sk_buff *skb)
2345 {
2346 }
2347
2348 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2349                                           unsigned long old)
2350 {
2351         return NULL;
2352 }
2353
2354 static struct dst_ops ipv4_dst_blackhole_ops = {
2355         .family                 =       AF_INET,
2356         .check                  =       ipv4_blackhole_dst_check,
2357         .mtu                    =       ipv4_blackhole_mtu,
2358         .default_advmss         =       ipv4_default_advmss,
2359         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2360         .redirect               =       ipv4_rt_blackhole_redirect,
2361         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2362         .neigh_lookup           =       ipv4_neigh_lookup,
2363 };
2364
2365 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2366 {
2367         struct rtable *ort = (struct rtable *) dst_orig;
2368         struct rtable *rt;
2369
2370         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2371         if (rt) {
2372                 struct dst_entry *new = &rt->dst;
2373
2374                 new->__use = 1;
2375                 new->input = dst_discard;
2376                 new->output = dst_discard_out;
2377
2378                 new->dev = ort->dst.dev;
2379                 if (new->dev)
2380                         dev_hold(new->dev);
2381
2382                 rt->rt_is_input = ort->rt_is_input;
2383                 rt->rt_iif = ort->rt_iif;
2384                 rt->rt_pmtu = ort->rt_pmtu;
2385
2386                 rt->rt_genid = rt_genid_ipv4(net);
2387                 rt->rt_flags = ort->rt_flags;
2388                 rt->rt_type = ort->rt_type;
2389                 rt->rt_gateway = ort->rt_gateway;
2390                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2391
2392                 INIT_LIST_HEAD(&rt->rt_uncached);
2393                 dst_free(new);
2394         }
2395
2396         dst_release(dst_orig);
2397
2398         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2399 }
2400
2401 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2402                                     const struct sock *sk)
2403 {
2404         struct rtable *rt = __ip_route_output_key(net, flp4);
2405
2406         if (IS_ERR(rt))
2407                 return rt;
2408
2409         if (flp4->flowi4_proto)
2410                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2411                                                         flowi4_to_flowi(flp4),
2412                                                         sk, 0);
2413
2414         return rt;
2415 }
2416 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2417
2418 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2419                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2420                         u32 seq, int event, int nowait, unsigned int flags)
2421 {
2422         struct rtable *rt = skb_rtable(skb);
2423         struct rtmsg *r;
2424         struct nlmsghdr *nlh;
2425         unsigned long expires = 0;
2426         u32 error;
2427         u32 metrics[RTAX_MAX];
2428
2429         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2430         if (!nlh)
2431                 return -EMSGSIZE;
2432
2433         r = nlmsg_data(nlh);
2434         r->rtm_family    = AF_INET;
2435         r->rtm_dst_len  = 32;
2436         r->rtm_src_len  = 0;
2437         r->rtm_tos      = fl4->flowi4_tos;
2438         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2439         if (nla_put_u32(skb, RTA_TABLE, table_id))
2440                 goto nla_put_failure;
2441         r->rtm_type     = rt->rt_type;
2442         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2443         r->rtm_protocol = RTPROT_UNSPEC;
2444         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2445         if (rt->rt_flags & RTCF_NOTIFY)
2446                 r->rtm_flags |= RTM_F_NOTIFY;
2447         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2448                 r->rtm_flags |= RTCF_DOREDIRECT;
2449
2450         if (nla_put_in_addr(skb, RTA_DST, dst))
2451                 goto nla_put_failure;
2452         if (src) {
2453                 r->rtm_src_len = 32;
2454                 if (nla_put_in_addr(skb, RTA_SRC, src))
2455                         goto nla_put_failure;
2456         }
2457         if (rt->dst.dev &&
2458             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2459                 goto nla_put_failure;
2460 #ifdef CONFIG_IP_ROUTE_CLASSID
2461         if (rt->dst.tclassid &&
2462             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2463                 goto nla_put_failure;
2464 #endif
2465         if (!rt_is_input_route(rt) &&
2466             fl4->saddr != src) {
2467                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2468                         goto nla_put_failure;
2469         }
2470         if (rt->rt_uses_gateway &&
2471             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2472                 goto nla_put_failure;
2473
2474         expires = rt->dst.expires;
2475         if (expires) {
2476                 unsigned long now = jiffies;
2477
2478                 if (time_before(now, expires))
2479                         expires -= now;
2480                 else
2481                         expires = 0;
2482         }
2483
2484         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2485         if (rt->rt_pmtu && expires)
2486                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2487         if (rtnetlink_put_metrics(skb, metrics) < 0)
2488                 goto nla_put_failure;
2489
2490         if (fl4->flowi4_mark &&
2491             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2492                 goto nla_put_failure;
2493
2494         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2495             nla_put_u32(skb, RTA_UID,
2496                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2497                 goto nla_put_failure;
2498
2499         error = rt->dst.error;
2500
2501         if (rt_is_input_route(rt)) {
2502 #ifdef CONFIG_IP_MROUTE
2503                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2504                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2505                         int err = ipmr_get_route(net, skb,
2506                                                  fl4->saddr, fl4->daddr,
2507                                                  r, nowait, portid);
2508
2509                         if (err <= 0) {
2510                                 if (!nowait) {
2511                                         if (err == 0)
2512                                                 return 0;
2513                                         goto nla_put_failure;
2514                                 } else {
2515                                         if (err == -EMSGSIZE)
2516                                                 goto nla_put_failure;
2517                                         error = err;
2518                                 }
2519                         }
2520                 } else
2521 #endif
2522                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2523                                 goto nla_put_failure;
2524         }
2525
2526         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2527                 goto nla_put_failure;
2528
2529         nlmsg_end(skb, nlh);
2530         return 0;
2531
2532 nla_put_failure:
2533         nlmsg_cancel(skb, nlh);
2534         return -EMSGSIZE;
2535 }
2536
2537 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2538 {
2539         struct net *net = sock_net(in_skb->sk);
2540         struct rtmsg *rtm;
2541         struct nlattr *tb[RTA_MAX+1];
2542         struct rtable *rt = NULL;
2543         struct flowi4 fl4;
2544         __be32 dst = 0;
2545         __be32 src = 0;
2546         u32 iif;
2547         int err;
2548         int mark;
2549         struct sk_buff *skb;
2550         u32 table_id = RT_TABLE_MAIN;
2551         kuid_t uid;
2552
2553         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2554         if (err < 0)
2555                 goto errout;
2556
2557         rtm = nlmsg_data(nlh);
2558
2559         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2560         if (!skb) {
2561                 err = -ENOBUFS;
2562                 goto errout;
2563         }
2564
2565         /* Reserve room for dummy headers, this skb can pass
2566            through good chunk of routing engine.
2567          */
2568         skb_reset_mac_header(skb);
2569         skb_reset_network_header(skb);
2570
2571         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2572         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2573         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2574
2575         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2576         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2577         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2578         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2579         if (tb[RTA_UID])
2580                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2581         else
2582                 uid = (iif ? INVALID_UID : current_uid());
2583
2584         memset(&fl4, 0, sizeof(fl4));
2585         fl4.daddr = dst;
2586         fl4.saddr = src;
2587         fl4.flowi4_tos = rtm->rtm_tos;
2588         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2589         fl4.flowi4_mark = mark;
2590         fl4.flowi4_uid = uid;
2591
2592         if (netif_index_is_l3_master(net, fl4.flowi4_oif))
2593                 fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
2594
2595         if (iif) {
2596                 struct net_device *dev;
2597
2598                 dev = __dev_get_by_index(net, iif);
2599                 if (!dev) {
2600                         err = -ENODEV;
2601                         goto errout_free;
2602                 }
2603
2604                 skb->protocol   = htons(ETH_P_IP);
2605                 skb->dev        = dev;
2606                 skb->mark       = mark;
2607                 local_bh_disable();
2608                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2609                 local_bh_enable();
2610
2611                 rt = skb_rtable(skb);
2612                 if (err == 0 && rt->dst.error)
2613                         err = -rt->dst.error;
2614         } else {
2615                 rt = ip_route_output_key(net, &fl4);
2616
2617                 err = 0;
2618                 if (IS_ERR(rt))
2619                         err = PTR_ERR(rt);
2620         }
2621
2622         if (err)
2623                 goto errout_free;
2624
2625         skb_dst_set(skb, &rt->dst);
2626         if (rtm->rtm_flags & RTM_F_NOTIFY)
2627                 rt->rt_flags |= RTCF_NOTIFY;
2628
2629         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2630                 table_id = rt->rt_table_id;
2631
2632         err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2633                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2634                            RTM_NEWROUTE, 0, 0);
2635         if (err < 0)
2636                 goto errout_free;
2637
2638         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2639 errout:
2640         return err;
2641
2642 errout_free:
2643         kfree_skb(skb);
2644         goto errout;
2645 }
2646
2647 void ip_rt_multicast_event(struct in_device *in_dev)
2648 {
2649         rt_cache_flush(dev_net(in_dev->dev));
2650 }
2651
2652 #ifdef CONFIG_SYSCTL
2653 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2654 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2655 static int ip_rt_gc_elasticity __read_mostly    = 8;
2656
2657 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2658                                         void __user *buffer,
2659                                         size_t *lenp, loff_t *ppos)
2660 {
2661         struct net *net = (struct net *)__ctl->extra1;
2662
2663         if (write) {
2664                 rt_cache_flush(net);
2665                 fnhe_genid_bump(net);
2666                 return 0;
2667         }
2668
2669         return -EINVAL;
2670 }
2671
2672 static struct ctl_table ipv4_route_table[] = {
2673         {
2674                 .procname       = "gc_thresh",
2675                 .data           = &ipv4_dst_ops.gc_thresh,
2676                 .maxlen         = sizeof(int),
2677                 .mode           = 0644,
2678                 .proc_handler   = proc_dointvec,
2679         },
2680         {
2681                 .procname       = "max_size",
2682                 .data           = &ip_rt_max_size,
2683                 .maxlen         = sizeof(int),
2684                 .mode           = 0644,
2685                 .proc_handler   = proc_dointvec,
2686         },
2687         {
2688                 /*  Deprecated. Use gc_min_interval_ms */
2689
2690                 .procname       = "gc_min_interval",
2691                 .data           = &ip_rt_gc_min_interval,
2692                 .maxlen         = sizeof(int),
2693                 .mode           = 0644,
2694                 .proc_handler   = proc_dointvec_jiffies,
2695         },
2696         {
2697                 .procname       = "gc_min_interval_ms",
2698                 .data           = &ip_rt_gc_min_interval,
2699                 .maxlen         = sizeof(int),
2700                 .mode           = 0644,
2701                 .proc_handler   = proc_dointvec_ms_jiffies,
2702         },
2703         {
2704                 .procname       = "gc_timeout",
2705                 .data           = &ip_rt_gc_timeout,
2706                 .maxlen         = sizeof(int),
2707                 .mode           = 0644,
2708                 .proc_handler   = proc_dointvec_jiffies,
2709         },
2710         {
2711                 .procname       = "gc_interval",
2712                 .data           = &ip_rt_gc_interval,
2713                 .maxlen         = sizeof(int),
2714                 .mode           = 0644,
2715                 .proc_handler   = proc_dointvec_jiffies,
2716         },
2717         {
2718                 .procname       = "redirect_load",
2719                 .data           = &ip_rt_redirect_load,
2720                 .maxlen         = sizeof(int),
2721                 .mode           = 0644,
2722                 .proc_handler   = proc_dointvec,
2723         },
2724         {
2725                 .procname       = "redirect_number",
2726                 .data           = &ip_rt_redirect_number,
2727                 .maxlen         = sizeof(int),
2728                 .mode           = 0644,
2729                 .proc_handler   = proc_dointvec,
2730         },
2731         {
2732                 .procname       = "redirect_silence",
2733                 .data           = &ip_rt_redirect_silence,
2734                 .maxlen         = sizeof(int),
2735                 .mode           = 0644,
2736                 .proc_handler   = proc_dointvec,
2737         },
2738         {
2739                 .procname       = "error_cost",
2740                 .data           = &ip_rt_error_cost,
2741                 .maxlen         = sizeof(int),
2742                 .mode           = 0644,
2743                 .proc_handler   = proc_dointvec,
2744         },
2745         {
2746                 .procname       = "error_burst",
2747                 .data           = &ip_rt_error_burst,
2748                 .maxlen         = sizeof(int),
2749                 .mode           = 0644,
2750                 .proc_handler   = proc_dointvec,
2751         },
2752         {
2753                 .procname       = "gc_elasticity",
2754                 .data           = &ip_rt_gc_elasticity,
2755                 .maxlen         = sizeof(int),
2756                 .mode           = 0644,
2757                 .proc_handler   = proc_dointvec,
2758         },
2759         {
2760                 .procname       = "mtu_expires",
2761                 .data           = &ip_rt_mtu_expires,
2762                 .maxlen         = sizeof(int),
2763                 .mode           = 0644,
2764                 .proc_handler   = proc_dointvec_jiffies,
2765         },
2766         {
2767                 .procname       = "min_pmtu",
2768                 .data           = &ip_rt_min_pmtu,
2769                 .maxlen         = sizeof(int),
2770                 .mode           = 0644,
2771                 .proc_handler   = proc_dointvec,
2772         },
2773         {
2774                 .procname       = "min_adv_mss",
2775                 .data           = &ip_rt_min_advmss,
2776                 .maxlen         = sizeof(int),
2777                 .mode           = 0644,
2778                 .proc_handler   = proc_dointvec,
2779         },
2780         { }
2781 };
2782
2783 static struct ctl_table ipv4_route_flush_table[] = {
2784         {
2785                 .procname       = "flush",
2786                 .maxlen         = sizeof(int),
2787                 .mode           = 0200,
2788                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2789         },
2790         { },
2791 };
2792
2793 static __net_init int sysctl_route_net_init(struct net *net)
2794 {
2795         struct ctl_table *tbl;
2796
2797         tbl = ipv4_route_flush_table;
2798         if (!net_eq(net, &init_net)) {
2799                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2800                 if (!tbl)
2801                         goto err_dup;
2802
2803                 /* Don't export sysctls to unprivileged users */
2804                 if (net->user_ns != &init_user_ns)
2805                         tbl[0].procname = NULL;
2806         }
2807         tbl[0].extra1 = net;
2808
2809         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2810         if (!net->ipv4.route_hdr)
2811                 goto err_reg;
2812         return 0;
2813
2814 err_reg:
2815         if (tbl != ipv4_route_flush_table)
2816                 kfree(tbl);
2817 err_dup:
2818         return -ENOMEM;
2819 }
2820
2821 static __net_exit void sysctl_route_net_exit(struct net *net)
2822 {
2823         struct ctl_table *tbl;
2824
2825         tbl = net->ipv4.route_hdr->ctl_table_arg;
2826         unregister_net_sysctl_table(net->ipv4.route_hdr);
2827         BUG_ON(tbl == ipv4_route_flush_table);
2828         kfree(tbl);
2829 }
2830
2831 static __net_initdata struct pernet_operations sysctl_route_ops = {
2832         .init = sysctl_route_net_init,
2833         .exit = sysctl_route_net_exit,
2834 };
2835 #endif
2836
2837 static __net_init int rt_genid_init(struct net *net)
2838 {
2839         atomic_set(&net->ipv4.rt_genid, 0);
2840         atomic_set(&net->fnhe_genid, 0);
2841         get_random_bytes(&net->ipv4.dev_addr_genid,
2842                          sizeof(net->ipv4.dev_addr_genid));
2843         return 0;
2844 }
2845
2846 static __net_initdata struct pernet_operations rt_genid_ops = {
2847         .init = rt_genid_init,
2848 };
2849
2850 static int __net_init ipv4_inetpeer_init(struct net *net)
2851 {
2852         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2853
2854         if (!bp)
2855                 return -ENOMEM;
2856         inet_peer_base_init(bp);
2857         net->ipv4.peers = bp;
2858         return 0;
2859 }
2860
2861 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2862 {
2863         struct inet_peer_base *bp = net->ipv4.peers;
2864
2865         net->ipv4.peers = NULL;
2866         inetpeer_invalidate_tree(bp);
2867         kfree(bp);
2868 }
2869
2870 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2871         .init   =       ipv4_inetpeer_init,
2872         .exit   =       ipv4_inetpeer_exit,
2873 };
2874
2875 #ifdef CONFIG_IP_ROUTE_CLASSID
2876 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2877 #endif /* CONFIG_IP_ROUTE_CLASSID */
2878
2879 int __init ip_rt_init(void)
2880 {
2881         int rc = 0;
2882         int cpu;
2883
2884         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2885         if (!ip_idents)
2886                 panic("IP: failed to allocate ip_idents\n");
2887
2888         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2889
2890         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2891         if (!ip_tstamps)
2892                 panic("IP: failed to allocate ip_tstamps\n");
2893
2894         for_each_possible_cpu(cpu) {
2895                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2896
2897                 INIT_LIST_HEAD(&ul->head);
2898                 spin_lock_init(&ul->lock);
2899         }
2900 #ifdef CONFIG_IP_ROUTE_CLASSID
2901         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2902         if (!ip_rt_acct)
2903                 panic("IP: failed to allocate ip_rt_acct\n");
2904 #endif
2905
2906         ipv4_dst_ops.kmem_cachep =
2907                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2908                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2909
2910         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2911
2912         if (dst_entries_init(&ipv4_dst_ops) < 0)
2913                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2914
2915         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2916                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2917
2918         ipv4_dst_ops.gc_thresh = ~0;
2919         ip_rt_max_size = INT_MAX;
2920
2921         devinet_init();
2922         ip_fib_init();
2923
2924         if (ip_rt_proc_init())
2925                 pr_err("Unable to create route proc files\n");
2926 #ifdef CONFIG_XFRM
2927         xfrm_init();
2928         xfrm4_init();
2929 #endif
2930         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2931
2932 #ifdef CONFIG_SYSCTL
2933         register_pernet_subsys(&sysctl_route_ops);
2934 #endif
2935         register_pernet_subsys(&rt_genid_ops);
2936         register_pernet_subsys(&ipv4_inetpeer_ops);
2937         return rc;
2938 }
2939
2940 #ifdef CONFIG_SYSCTL
2941 /*
2942  * We really need to sanitize the damn ipv4 init order, then all
2943  * this nonsense will go away.
2944  */
2945 void __init ip_static_sysctl_init(void)
2946 {
2947         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2948 }
2949 #endif