/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *	Alan Cox	:	Verify area fixes.
 *	Alan Cox	:	cli() protects routing changes
 *	Rui Oliveira	:	ICMP routing table updates
 *	(rco@di.uminho.pt)	Routing table insertion and update
 *	Linus Torvalds	:	Rewrote bits to be sensible
 *	Alan Cox	:	Added BSD route gw semantics
 *	Alan Cox	:	Super /proc >4K
 *	Alan Cox	:	MTU in route table
 *	Alan Cox	:	MSS actually. Also added the window clamping.
 *	Sam Lantinga	:	Fixed route matching in rt_del()
 *	Alan Cox	:	Routing cache support.
 *	Alan Cox	:	Removed compatibility cruft.
 *	Alan Cox	:	RTF_REJECT support.
 *	Alan Cox	:	TCP irtt support.
 *	Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *	Alan Cox	:	Use __u32 properly
 *	Alan Cox	:	Aligned routing errors more closely with BSD;
 *				our system is still very different.
 *	Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *				routing caches and better behaviour.
 *
 *	Olaf Erb	:	irtt wasn't being copied right.
 *	Bjorn Ekwall	:	Kerneld route support.
 *	Alan Cox	:	Multicast fixed (I hope)
 *	Pavel Krauz	:	Limited broadcast fixed
 *	Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *				route.c and rewritten from scratch.
 *	Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *	Marc Boucher	:	routing by fwmark
 *	Robert Olsson	:	Added rt_cache statistics
 *	Arnaldo C. Melo	:	Convert proc stuff to seq_file
 *	Eric Dumazet	:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov	:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov	:	Removed TOS from hash calculations
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#include <net/atmclip.h>
#include <net/secure_seq.h>
#define RT_FL_TOS(oldflp4) \
	((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
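/* RT_FL_TOS keeps only the bits of the flow TOS that matter for routing:
 * the IPTOS_RT_MASK portion of the TOS byte plus the RTO_ONLINK flag, which
 * requests a link-scope lookup (see how ip_route_output_slow() maps
 * RTO_ONLINK to RT_SCOPE_LINK below).
 */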
#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300*HZ)
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly	= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;
static int redirect_genid;
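/* Most of the tunables above are exported via sysctl under
 * /proc/sys/net/ipv4/route/.  Note how the redirect-silence default is
 * derived from the other two redirect defaults:
 * (HZ / 50) << (9 + 1) == ip_rt_redirect_load << (ip_rt_redirect_number + 1),
 * i.e. silence kicks in once the exponential backoff has run its course.
 */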
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);
		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else if (rt->fi) {
			fib_info_put(rt->fi);
			rt->fi = NULL;
		}
	}
	return p;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
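/* The table is indexed by the TOS byte shifted right by one (see
 * rt_tos2priority() in include/net/route.h), so each TC_PRIO_* class
 * appears twice: once for the plain TOS value and once for the variant
 * with the low "minimize monetary cost" bit set, which ECN_OR_COST()
 * maps to the same priority.
 */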
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    help of RCU.
 */
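/* Illustrative sketch of the resulting reader pattern (this is what
 * ip_route_input_common() below actually does): walk a chain under RCU,
 * bump the refcount only on a match, never block.  keys_match() stands in
 * for the real key comparison:
 *
 *	rcu_read_lock();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->dst.rt_next))
 *		if (keys_match(rth))
 *			dst_use(&rth->dst, jiffies);  (atomic refcount bump)
 *	rcu_read_unlock();
 */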
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif
static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
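/* Lock striping: a bucket index hashes down to one of RT_HASH_LOCK_SZ locks
 * via its low bits, so unrelated buckets may share a spinlock.  E.g. with
 * RT_HASH_LOCK_SZ == 256, buckets 5, 261, 517, ... all serialize on
 * rt_hash_locks[5].  Readers are unaffected: they use only RCU.
 */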
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_ATOMIC);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid) & rt_hash_mask;
}
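/* The generation id is folded into the hash as jhash's initval, so bumping
 * rt_genid effectively rehashes every cached route into a new bucket: stale
 * entries become unreachable by lookup and are reaped lazily (see
 * rt_cache_invalidate() below).
 */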
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;

	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len;

		n = dst_get_neighbour(&r->dst);
		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->rt_key_tos,
			   -1,
			   (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
			   r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			    sizeof(struct rt_cache_iter_state));
}
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;
	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,
		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
				   &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
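/* Because the age is inverted before masking, a recently used entry gets a
 * large 30-bit component and an idle one gets a small one, with bits 31/30
 * promoting valuable and output/unicast routes above that.
 * rt_intern_hash() below evicts the entry with the *lowest* score, i.e.
 * the oldest, least valuable one in the chain.
 */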
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
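/* Both helpers use the classic branch-free comparison: XOR each pair of
 * fields (zero iff equal) and OR the results together, so the whole key
 * matches exactly when the accumulated value is 0.  compare_hash_inputs()
 * covers only the fields that feed rt_hash(); compare_keys() covers the
 * full lookup key (mark, tos and oif included).
 */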
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
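/* Worked example: with FRACT_BITS == 3, ONE == 8, so a true average chain
 * length of 2.5 is represented as 20 (2.5 * 8); shifting right by FRACT_BITS
 * recovers the integer part, as slow_chain_length() does below.
 */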
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}
/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without reusing a recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	redirect_genid++;
}
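/* After the bump, every cached rtable whose rt_genid predates the new value
 * fails rt_is_expired(), so lookups skip it and writers prune it on sight;
 * no synchronous walk of the whole table is required.
 */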
/*
 * delay < 0  : invalidate cache (fast: entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previously cache-invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
/*
 * Short description of GC goals.
 *
 * We want to build an algorithm which will keep the routing cache
 * at some equilibrium point, where the number of aged-off entries
 * is kept approximately equal to the number of newly generated ones.
 *
 * Current expiration strength is the variable "expire".
 * We try to adjust it dynamically, so that if networking
 * is idle expire is large enough to keep enough warm entries,
 * and when load increases it reduces to limit the cache size.
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;
			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
				    !rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire was reduced to zero. Otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}
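/* Each chain element with no earlier alias contributes ONE (1 << FRACT_BITS)
 * to "length", so the final shift yields the count of distinct hash inputs.
 * E.g. a chain of 10 entries that are all aliases of each other has a slow
 * chain length of 1.
 */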
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	struct neigh_table *tbl = &arp_tbl;
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	struct neighbour *n;

#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
	if (dev->type == ARPHRD_ATM)
		tbl = clip_tbl_hook;
#endif
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;

	n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(tbl, pkey, dev);
}
static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it.
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when it was exceeded, gc became really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that stays unique for a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If the peer is attached to the destination, it is never
		 * detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n)) {
		rt->rt_gateway = orig_gw;
		return -EAGAIN;
	}
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!n || !(n->nud_state & NUD_VALID)) {
		if (n)
			neigh_event_send(n, NULL);
		rt->rt_gateway = orig_gw;
		return -EAGAIN;
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
	return 0;
}
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw ||
					    peer->redirect_genid != redirect_genid) {
						peer->redirect_learned.a4 = new_gw;
						peer->redirect_genid = redirect_genid;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}
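/* The cmpxchg() against the snapshotted expiry means only one caller can
 * "claim" a learned PMTU: whoever swaps pmtu_expires to 0 first wins and is
 * responsible for restoring pmtu_orig; concurrent callers see the cmpxchg
 * fail and do nothing.
 */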
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}
/*
 * 1. The first ip_rt_redirect_number redirects are sent
 *    with exponential backoff, then we stop sending them at all,
 *    assuming that the host ignores our redirects.
 * 2. If we did not see packets requiring redirects
 *    during ip_rt_redirect_silence, we assume that the host
 *    forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
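/* Example: a Fragmentation Needed message that carries no next-hop MTU but
 * reports an old_mtu of 1500 walks the plateau table and guesses 1492 (the
 * next value strictly below 1500); an old_mtu of 128 or less falls through
 * to the 68-byte IPv4 minimum.
 */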
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(dst, peer);

			if (peer->redirect_genid != redirect_genid)
				peer->redirect_learned.a4 = 0;
			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway) {
				if (check_peer_redir(dst, peer))
					return NULL;
			}
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
	return dst;
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}
static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be not aligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_genid != redirect_genid)
			peer->redirect_learned.a4 = 0;
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation: if the source is martian,
		 * the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back out the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif 	= in_dev->dev->ifindex;
	rth->rt_oif 	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
/*
 * NOTE. We drop all packets that have a local source
 * address, because every properly looped-back packet
 * must already have the correct destination attached by the output routine.
 *
 * Such an approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 * called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net	*net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable	*rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	u32 tos = RT_FL_TOS(fl4);
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use the default one,
		 * but do not gateway in this case. Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid	= rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark	= fl4->flowi4_mark;
	rth->rt_gateway	= fl4->daddr;
	rth->rt_spec_dst = fl4->saddr;
	rth->rt_peer_genid = 0;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
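
/*
 * Note the split between lookup-key fields and resolved fields in the
 * entry built above: rt_key_dst/rt_key_src/rt_key_tos hold the original
 * request (so later cache probes can compare against the raw flow),
 * while rt_dst/rt_src/rt_gateway hold what the FIB actually resolved.
 * E.g. a lookup with saddr 0 is cached with rt_key_src == 0 but rt_src
 * set to the selected source address.
 */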
/*
 * Major route resolver routine.
 * called with rcu_read_lock();
 */

static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	u32 tos	= RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			     RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		 * It was wrong for two reasons:
		 * 1. ip_dev_find(net, saddr) can return the wrong iface, if
		 *    saddr is assigned to multiple interfaces.
		 * 2. Moreover, we are allowed to send packets with the saddr
		 *    of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			 * and limited broadcast via the necessary interface
			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			 * This hack is not just for fun, it allows
			 * vic, vat and friends to work.
			 * They bind a socket to loopback, set ttl to zero
			 * and expect that it will work.
			 * From the viewpoint of the routing cache they are
			 * broken, because we are not allowed to build a
			 * multicast path with a loopback source addr (look,
			 * the routing cache cannot know that ttl is zero, so
			 * the packet will not leave this host and the route
			 * is valid). Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}
	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}
	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume
			 * that the destination is on-link.
			 *
			 * WHY? DW.
			 * Because we are allowed to send to an iface
			 * even if it has NO routes and NO assigned
			 * addresses. When oif is specified, routing
			 * tables are looked up with only one purpose:
			 * to catch if the destination is gatewayed, rather
			 * than direct. Moreover, if MSG_DONTROUTE is set,
			 * we send a packet, ignoring both routing tables
			 * and ifaddr state. --ANK
			 *
			 * We could make it even if oif is unknown,
			 * likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}
	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;
make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
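
/*
 * In short, the resolver above works through a fixed ladder:
 * validate saddr -> honour an explicit oif -> fall back to loopback for
 * an empty daddr -> fib_lookup() -> special-case local, multipath and
 * default routes -> __mkroute_output() -> rt_intern_hash(). A failed
 * fib_lookup() with an explicit oif is deliberately treated as
 * "destination is on-link" rather than as an error.
 */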
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
		      (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
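
/*
 * A minimal caller sketch (the local variable names are illustrative,
 * the API is the one defined above):
 *
 *	struct flowi4 fl4 = {
 *		.daddr      = daddr,
 *		.saddr      = 0,	// let the resolver pick a source
 *		.flowi4_tos = RT_TOS(tos),
 *		.flowi4_oif = 0,
 *	};
 *	struct rtable *rt = __ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	// fl4.saddr now holds the selected source address
 *	ip_rt_put(rt);
 *
 * Note the error convention: an ERR_PTR() is returned, never NULL.
 */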
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
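
/*
 * The blackhole route above clones an existing route but silently drops
 * every packet sent through it (dst_discard on both input and output).
 * It is handed out when a usable dst must be returned even though
 * traffic may not flow yet -- e.g. while IPsec (xfrm) key negotiation
 * is still pending on a non-blocking socket.
 */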
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
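
/*
 * ip_route_output_flow() is the variant protocol code should use when
 * the flow may be subject to IPsec policy: it first resolves a plain
 * route, then lets xfrm_lookup() wrap it in a transform bundle if
 * flowi4_proto is set. A hypothetical TCP-style caller:
 *
 *	rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */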
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
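
/*
 * Reminder for readers unfamiliar with the NLA_PUT*() macros used above:
 * they expand to a put that jumps to the local nla_put_failure label
 * when the skb runs out of tailroom, which is why the function can
 * assume every attribute either fits or unwinds through nlmsg_cancel().
 */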
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
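
/*
 * This handler is what userspace hits with "ip route get <addr>":
 * iproute2 sends an RTM_GETROUTE request whose RTA_DST/RTA_SRC/RTA_IIF/
 * RTA_OIF/RTA_MARK attributes map onto the lookup performed above, and
 * the reply is a single RTM_NEWROUTE message built by rt_fill_info().
 */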
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
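
/*
 * Dump state lives in cb->args[]: args[0] is the next hash bucket and
 * args[1] the next index within it, so an interrupted dump resumes
 * exactly where the previous netlink skb filled up.
 */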
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
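
/*
 * Writing an integer to /proc/sys/net/ipv4/route/flush lands here; the
 * value is interpreted as a flush delay, so e.g. from a shell:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * flushes the route cache immediately. Reads return -EINVAL.
 */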
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct),
				    __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif