/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *	Alan Cox	:	Verify area fixes.
 *	Alan Cox	:	cli() protects routing changes
 *	Rui Oliveira	:	ICMP routing table updates
 *	(rco@di.uminho.pt)	Routing table insertion and update
 *	Linus Torvalds	:	Rewrote bits to be sensible
 *	Alan Cox	:	Added BSD route gw semantics
 *	Alan Cox	:	Super /proc >4K
 *	Alan Cox	:	MTU in route table
 *	Alan Cox	:	MSS actually. Also added the window
 *	Sam Lantinga	:	Fixed route matching in rt_del()
 *	Alan Cox	:	Routing cache support.
 *	Alan Cox	:	Removed compatibility cruft.
 *	Alan Cox	:	RTF_REJECT support.
 *	Alan Cox	:	TCP irtt support.
 *	Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *	Alan Cox	:	Use __u32 properly
 *	Alan Cox	:	Aligned routing errors more closely with BSD;
 *				our system is still very different.
 *	Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *				routing caches and better behaviour.
 *	Olaf Erb	:	irtt wasn't being copied right.
 *	Bjorn Ekwall	:	Kerneld route support.
 *	Alan Cox	:	Multicast fixed (I hope)
 *	Pavel Krauz	:	Limited broadcast fixed
 *	Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *				route.c and rewritten from scratch.
 *	Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *	Marc Boucher	:	routing by fwmark
 *	Robert Olsson	:	Added rt_cache statistics
 *	Arnaldo C. Melo	:	Convert proc stuff to seq_file
 *	Eric Dumazet	:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov	:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov	:	Removed TOS from hash calculations
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/prefetch.h>

#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#include <net/secure_seq.h>
#define RT_FL_TOS(oldflp4) \
	((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;

/*
 *	Interface to generic destination cache.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_default_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
		rt_bind_peer(rt, rt->rt_dst, 1);
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
		fib_info_put(rt->fi);

static struct dst_ops ipv4_dst_ops = {
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.default_mtu =		ipv4_default_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	ECN_OR_COST(BESTEFFORT),
	ECN_OR_COST(BESTEFFORT),
	ECN_OR_COST(INTERACTIVE),
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
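/*
 * Illustrative sketch, not part of the original file: consumers index the
 * (partially elided) table above with the four RFC 1349 TOS bits shifted
 * right by one, as rt_tos2priority() in include/net/route.h does, e.g.
 * IPTOS_LOWDELAY (0x10) maps to a TC_PRIO_INTERACTIVE-class entry.
 */
#if 0
static inline char example_tos2priority(u8 tos)
{
	return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}
#endif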
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 */

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};
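/*
 * Reader-side sketch of the scheme above (a condensed, hypothetical version
 * of the real lookup in ip_route_input_common() further down; keys_match()
 * is a placeholder for the actual key comparison):
 */
#if 0
	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if (keys_match(rth) && !rt_is_expired(rth)) {
			dst_use_noref(&rth->dst, jiffies);	/* atomic ref */
			break;
		}
	}
	rcu_read_unlock();
#endif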
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
# define RT_HASH_LOCK_SZ	4096
# define RT_HASH_LOCK_SZ	2048
# define RT_HASH_LOCK_SZ	1024
# define RT_HASH_LOCK_SZ	512
# define RT_HASH_LOCK_SZ	256
static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
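/*
 * Writer-side sketch (assumed shape; rt_intern_hash() and rt_do_flush()
 * below are the real users): chain updates take the bucket's shared lock.
 */
#if 0
	spin_lock_bh(rt_hash_lock_addr(hash));
	/* unlink or insert entries of rt_hash_table[hash].chain here */
	spin_unlock_bh(rt_hash_lock_addr(hash));
#endif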
static __init void rt_hash_lock_init(void)
{
	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket *rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
	return jhash_3words((__force u32)daddr, (__force u32)saddr,

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
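/*
 * Typical combined use of the two helpers above, as in the input and
 * output lookup paths later in this file (sketch, not new code):
 */
#if 0
	unsigned hash = rt_hash(daddr, saddr, dev->ifindex,
				rt_genid(dev_net(dev)));
	struct rtable *rth = rcu_dereference(rt_hash_table[hash].chain);
#endif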
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
			r = rcu_dereference_bh(r->dst.rt_next);
		rcu_read_unlock_bh();
	}
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
		rcu_read_unlock_bh();
			if (--st->bucket < 0)
		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
		if (r->rt_genid == st->genid)
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	while (pos && (r = rt_cache_get_next(seq, r)))
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
		r = rt_cache_get_next(seq, v);
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
		struct rtable *r = v;

		n = dst_get_neighbour(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
			   r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			    sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.release = seq_release_net,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
		return &per_cpu(rt_cache_stat, cpu);
	}
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
		return &per_cpu(rt_cache_stat, cpu);
	}
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.release = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);

#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
#else
static inline int ip_rt_proc_init(void)
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	if (atomic_read(&rth->dst.__refcnt))
	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))

/* Bits of score are:
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
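/*
 * The XOR/OR pattern above is a branch-free way of testing several fields
 * for equality at once; ignoring the __force casts, it is equivalent to
 * this naive form (sketch only):
 */
#if 0
static inline int compare_keys_naive(struct rtable *rt1, struct rtable *rt2)
{
	return rt1->rt_key_dst == rt2->rt_key_dst &&
	       rt1->rt_key_src == rt2->rt_key_src &&
	       rt1->rt_mark == rt2->rt_mark &&
	       rt1->rt_key_tos == rt2->rt_key_tos &&
	       rt1->rt_route_iif == rt2->rt_route_iif &&
	       rt1->rt_oif == rt2->rt_oif;
}
#endif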
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;

		if (process_context && need_resched())
		rth = rcu_dereference_raw(rt_hash_table[i].chain);

		spin_lock_bh(rt_hash_lock_addr(i));

		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				pprev = &rth->dst.rt_next;

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to get an estimate of rt_chain_length_max:
 *	rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define ONE (1UL << FRACT_BITS)
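/*
 * Worked example, assuming FRACT_BITS == 3 (so ONE == 8): a bucket whose
 * chain holds two entries with distinct hash inputs accumulates
 * length == 2 * ONE == 16.  rt_check_expire() below keeps avg and sd in
 * this fixed-point domain and shifts back only when updating
 * rt_chain_length_max (hypothetical helper, mirroring that code):
 */
#if 0
static inline unsigned long example_chain_limit(unsigned long avg,
						unsigned long sd,
						int elasticity)
{
	return max_t(unsigned long, elasticity,
		     (avg + 4 * sd) >> FRACT_BITS);
}
#endif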
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

		if (compare_hash_inputs(aux, rth))
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);

static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (rcu_dereference_raw(*rthp) == NULL)
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum2 += length*length;

		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    (avg + 4*sd) >> FRACT_BITS);
/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}

/*
 * delay < 0  : invalidate cache (fast: entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
		rt_do_flush(net, !in_softirq());
}

/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}

/*
 * Short description of GC goals.
 *
 * We want to build an algorithm which will keep the routing cache
 * at some equilibrium point, where the number of aged-off entries
 * is kept approximately equal to the newly generated ones.
 *
 * Current expiration strength is the variable "expire".
 * We try to adjust it dynamically, so that if networking
 * is idle expire is large enough to keep enough warm entries,
 * and when load increases it reduces to limit cache size.
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int equilibrium;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not run it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate the number of entries which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		/* We are in a dangerous area. Try to reduce the cache
		 * really aggressively. */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;

	if (now - last_gc >= ip_rt_gc_min_interval)
		equilibrium += goal;

	for (i = rt_hash_mask, k = rover; i >= 0; i--) {
		unsigned long tmo = expire;

		k = (k + 1) & rt_hash_mask;
		rthp = &rt_hash_table[k].chain;
		spin_lock_bh(rt_hash_lock_addr(k));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
			if (!rt_is_expired(rth) &&
			    !rt_may_expire(rth, tmo, expire)) {
				rthp = &rth->dst.rt_next;
			*rthp = rth->dst.rt_next;
		spin_unlock_bh(rt_hash_lock_addr(k));
	/* Goal is not achieved. We stop the process if:
	 *
	 * - expire has been reduced to zero; otherwise, expire is halved.
	 * - the table is not full.
	 * - we are called from interrupt.
	 * - the jiffies check is just a fallback/debug loop breaker.
	 *   We will not spin here for a long time in any case.
	 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);

	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;

/*
 * Returns the number of entries in a hash chain that have different hash_inputs.
 */
static int slow_chain_length(const struct rtable *head)
{
	const struct rtable *rth = head;

		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	return length >> FRACT_BITS;
}
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable *rth, *cand;
	struct rtable __rcu **rthp, **candp;
	int attempts = !in_softirq();

	min_score = ~(u32)0;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 *
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting for a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = arp_bind_neighbour(&rt->dst);
				if (net_ratelimit())
					       "Neighbour table failure & not caching routes.\n");
				return ERR_PTR(err);

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
				lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

				skb_dst_set(skb, &rth->dst);

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
		rthp = &rth->dst.rt_next;
	}

	/* ip_rt_gc_elasticity used to be the average chain length;
	 * when exceeded, GC becomes really aggressive.
	 *
	 * The second limit is less certain.  At the moment it allows
	 * only 2 entries per bucket.  We will see.
	 */
	if (chain_length > ip_rt_gc_elasticity) {
		*candp = cand->dst.rt_next;

	if (chain_length > rt_chain_length_max &&
	    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
		struct net *net = dev_net(rt->dst.dev);
		int num = ++net->ipv4.current_rt_cache_rebuild_count;
		if (!rt_caching(net)) {
			printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
			       rt->dst.dev->name, num);
		rt_emergency_hash_rebuild(net);
		spin_unlock_bh(rt_hash_lock_addr(hash));

		hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
			       ifindex, rt_genid(net));

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = arp_bind_neighbour(&rt->dst);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				return ERR_PTR(err);

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   it most likely holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			return ERR_PTR(-ENOBUFS);

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

		skb_dst_set(skb, &rt->dst);
}

static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
	rt->rt_peer_genid = rt_peer_genid();
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that is unique in a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so we need not grab a lock to dereference it.
		 */
			iph->id = htons(inet_getid(rt->peer, more));

		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));

	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
		rthp = &aux->dst.rt_next;
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct inet_peer *peer;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;

	peer = inet_getpeer_v4(daddr, 1);
		peer->redirect_learned.a4 = new_gw;

		atomic_inc(&__rt_peer_genid);

#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,

static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

		if (dst->obsolete > 0) {
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt_genid(dev_net(dst->dev)));
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
/*
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
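/*
 * Sketch of the backoff schedule implemented below (hypothetical helper):
 * the n-th redirect is allowed once ip_rt_redirect_load << n jiffies have
 * passed since the previous one, i.e. 20ms, 40ms, 80ms, ... with the
 * default settings.
 */
#if 0
static inline bool example_redirect_due(const struct inet_peer *peer)
{
	return peer->rate_tokens == 0 ||
	       time_after(jiffies,
			  (peer->rate_last +
			   (ip_rt_redirect_load << peer->rate_tokens)));
}
#endif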
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;

	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);

		rt_bind_peer(rt, rt->rt_dst, 1);
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;

	/* Check for load limit; set rate_last to the latest sent
	 */
	if (peer->rate_tokens == 0 ||
			   (ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		    peer->rate_tokens == ip_rt_redirect_number &&
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;

	switch (rt->dst.error) {
		code = ICMP_HOST_UNREACH;
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		code = ICMP_PKT_FILTERED;

		rt_bind_peer(rt, rt->rt_dst, 1);

		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;

		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);

/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */
static const unsigned short mtu_plateau[] =
	{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
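/*
 * Example (assumed call, not in the original): a broken "frag needed"
 * reporting MTU 0 for a 1500-byte datagram makes guess_mtu(1500) return
 * the next plateau down, 1492.
 */
#if 0
	unsigned short mtu = guess_mtu(1500);	/* -> 1492 */
#endif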
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

		atomic_inc(&__rt_peer_genid);

	return est_mtu ? : new_mtu;
}

static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

		rt_bind_peer(rt, rt->rt_dst, 1);
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {
			pmtu_expires = jiffies + ip_rt_mtu_expires;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		check_peer_pmtu(dst, peer);
}

static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;
	n = __arp_bind_neighbour(&rt->dst, rt->rt_gateway);
		old_n = xchg(&rt->dst._neighbour, n);
		neigh_release(old_n);
	if (!n || !(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		rt->rt_gateway = orig_gw;
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

			rt_bind_peer(rt, rt->rt_dst, 0);

			check_peer_pmtu(dst, peer);

			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway) {
				if (check_peer_redir(dst, peer))

		rt->rt_peer_genid = rt_peer_genid();
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

		fib_info_put(rt->fi);

static void ipv4_link_failure(struct sk_buff *skb)
{
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned.
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
		struct fib_result res;

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

		advmss = max_t(unsigned int, dst->dev->mtu - 40,
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
}

static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		const struct rtable *rt = (const struct rtable *) dst;

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)

	if (mtu > IP_MAX_MTU)
}

static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			atomic_inc(&fi->fib_clntref);
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
}

static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
	set_class_tag(rt, itag);
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);

	/* Primary sanity checks. */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,

	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_mark	= skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;

		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation: if the source is martian,
		 * the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				if (i < (dev->hard_header_len - 1))

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct in_device *out_dev;
	unsigned int flags = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,

		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP).  Do not create route, if it is
		 * invalid for proxy arp.  DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies.  See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags	= flags;
	rth->rt_type	= res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif	= in_dev->dev->ifindex;
	rth->rt_mark	= skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
		return PTR_ERR(rth);
/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped-back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	called with rcu_read_lock()
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct rtable *rth;
	struct net *net = dev_net(dev);

	/* IP on this device is disabled. */

	/* Check for the most weird martians, which cannot be detected
	 */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;

	err = fib_lookup(net, &fl4, &res);
	if (!IN_DEV_FORWARD(in_dev))

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
			goto martian_source_keep_err;
			flags |= RTCF_DIRECTSRC;

	if (!IN_DEV_FORWARD(in_dev))
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);

	if (skb->protocol != htons(ETH_P_IP))

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
			goto martian_source_keep_err;
			flags |= RTCF_DIRECTSRC;
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(net);
	rth->rt_flags	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_mark	= skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;

	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);

	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);

	err = -EHOSTUNREACH;

martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
}

int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable *rth;
	int iif = dev->ifindex;

	if (!rt_caching(net))

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			RT_CACHE_STAT_INC(in_hit);
		RT_CACHE_STAT_INC(in_hlist_search);
	/* Multicast recognition logic is moved from route cache to here.
	 * The problem was that too many Ethernet cards have broken/missing
	 * hardware multicast filters :-(  As a result, the host on a
	 * multicasting network acquires a lot of useless route cache
	 * entries, sort of SDR messages from all the world.  Now we try
	 * to get rid of them.
	 * Really, provided software IP multicast filter is organized
	 * reasonably (at least, hashed), it does not result in a slowdown
	 * compared with route cache reject entries.
	 * Note, that multicast routers are not affected, because
	 * route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
#ifdef CONFIG_IP_MROUTE
				(!ipv4_is_local_multicast(daddr) &&
				 IN_DEV_MFORWARD(in_dev))
				int res = ip_route_input_mc(skb, daddr, saddr,

	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
EXPORT_SYMBOL(ip_route_input_common);

/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, struct net_device *dev_out,
	struct fib_info *fi = res->fi;
	u32 tos = RT_FL_TOS(fl4);
	struct in_device *in_dev;
	u16 type = res->type;

	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use
		 * the default one, but do not gateway in this case.
		 */
		if (fi && res->prefixlen < 4)

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
		return ERR_PTR(-ENOBUFS);
2497 rth->dst.output = ip_output;
2499 rth->rt_key_dst = orig_daddr;
2500 rth->rt_key_src = orig_saddr;
2501 rth->rt_genid = rt_genid(dev_net(dev_out));
2502 rth->rt_flags = flags;
2503 rth->rt_type = type;
2504 rth->rt_key_tos = tos;
2505 rth->rt_dst = fl4->daddr;
2506 rth->rt_src = fl4->saddr;
2507 rth->rt_route_iif = 0;
2508 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2509 rth->rt_oif = orig_oif;
2510 rth->rt_mark = fl4->flowi4_mark;
2511 rth->rt_gateway = fl4->daddr;
2512 rth->rt_spec_dst= fl4->saddr;
2513 rth->rt_peer_genid = 0;
2517 RT_CACHE_STAT_INC(out_slow_tot);
2519 if (flags & RTCF_LOCAL) {
2520 rth->dst.input = ip_local_deliver;
2521 rth->rt_spec_dst = fl4->daddr;
2523 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2524 rth->rt_spec_dst = fl4->saddr;
2525 if (flags & RTCF_LOCAL &&
2526 !(dev_out->flags & IFF_LOOPBACK)) {
2527 rth->dst.output = ip_mc_output;
2528 RT_CACHE_STAT_INC(out_slow_mc);
2530 #ifdef CONFIG_IP_MROUTE
2531 if (type == RTN_MULTICAST) {
2532 if (IN_DEV_MFORWARD(in_dev) &&
2533 !ipv4_is_local_multicast(fl4->daddr)) {
2534 rth->dst.input = ip_mr_input;
2535 rth->dst.output = ip_mc_output;
2541 rt_set_nexthop(rth, fl4, res, fi, type, 0);
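
/*
 * Note: __mkroute_output() only builds the rtable; it is not yet visible
 * in the cache. ip_route_output_slow() below interns the result with
 * rt_intern_hash(), which may hand back an already-cached equivalent
 * entry instead.
 */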

/*
 * Major route resolver routine.
 * called with rcu_read_lock();
 */

static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	u32 tos	= RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: the user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun; it allows
			   vic, vat and friends to work.
			   They bind the socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (look:
			   the routing cache cannot know that ttl is zero, so
			   the packet will not leave this host and the route
			   is valid). Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}

	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on link.

			   Why? Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch whether the destination is gatewayed,
			   rather than direct. Moreover, if MSG_DONTROUTE
			   is set, we send the packet, ignoring both routing
			   tables and ifaddr state. --ANK

			   We could make this work even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;

make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	return rth;
}
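
/*
 * __ip_route_output_key() below is the cache front end for the resolver
 * above: it probes one rt_hash bucket under rcu_read_lock_bh() and only
 * falls back to ip_route_output_slow() on a miss (or when caching is
 * disabled). The tos comparison masks with IPTOS_RT_MASK | RTO_ONLINK,
 * so an on-link flag mismatch counts as a different key.
 */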
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	rcu_read_lock();
	rth = ip_route_output_slow(net, flp4);
	rcu_read_unlock();
	return rth;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
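
/*
 * Usage sketch (a hypothetical caller, not part of this file): an output
 * lookup is keyed by a struct flowi4, e.g.
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.flowi4_tos	= RT_TOS(tos),
 *		.flowi4_oif	= oif,
 *	};
 *	struct rtable *rt = __ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *
 * On success, fl4.saddr (and fl4.daddr) are filled in from the route if
 * the caller left them as zero.
 */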

static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
{
	return 0;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.default_mtu		=	ipv4_blackhole_default_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
};

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
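
/*
 * ipv4_blackhole_route() serves the xfrm side: when a dst must be handed
 * out while its transforms are not yet usable, the caller gets a
 * metrics-preserving copy whose input/output hooks simply discard
 * packets instead of transmitting them.
 */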

struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
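
/*
 * ip_route_output_flow() adds the xfrm step to __ip_route_output_key():
 * when flowi4_proto is set, the plain route is handed to xfrm_lookup()
 * and may come back wrapped in an IPsec bundle (or as a blackhole dst,
 * see above). The ip_route_output_key() helper in <net/route.h> is just
 * this call with a NULL socket.
 */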

static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires)
			expires -= jiffies;
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
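
/*
 * Note on the NLA_PUT*() macros used above: they expand to a goto
 * nla_put_failure when the skb runs out of tailroom, which is why that
 * label must exist even though no explicit error check appears at each
 * call site.
 */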

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0, src = 0;
	u32 iif;
	int err, mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr		= dst,
			.saddr		= src,
			.flowi4_tos	= rtm->rtm_tos,
			.flowi4_oif	= tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark	= mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
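
/*
 * This is the handler behind "ip route get": userspace sends an
 * RTM_GETROUTE request, e.g.
 *
 *	$ ip route get 192.0.2.1
 *
 * and the kernel resolves the route exactly as a packet would (input
 * path when RTA_IIF is given, output path otherwise), then echoes the
 * result back as an RTM_NEWROUTE message via rt_fill_info().
 */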

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h, idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
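
/*
 * Administrators reach the handler above through procfs; writing a delay
 * (in seconds) flushes the cache, e.g.
 *
 *	# echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * The ctl_table copy dance exists so the parsed value lands in the local
 * flush_delay rather than in any shared variable.
 */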

static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
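
/*
 * The "rhash_entries=" boot parameter lets an administrator size the
 * route cache hash table explicitly, e.g. booting with
 *
 *	rhash_entries=262144
 *
 * overrides the memory-based default passed to alloc_large_system_hash()
 * in ip_rt_init() below.
 */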

int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif