net: Use VRF device index for lookups on RX
[firefly-linux-kernel-4.4.55.git] / net / ipv4 / fib_frontend.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              IPv4 Forwarding Information Base: FIB frontend.
7  *
8  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9  *
10  *              This program is free software; you can redistribute it and/or
11  *              modify it under the terms of the GNU General Public License
12  *              as published by the Free Software Foundation; either version
13  *              2 of the License, or (at your option) any later version.
14  */
15
16 #include <linux/module.h>
17 #include <asm/uaccess.h>
18 #include <linux/bitops.h>
19 #include <linux/capability.h>
20 #include <linux/types.h>
21 #include <linux/kernel.h>
22 #include <linux/mm.h>
23 #include <linux/string.h>
24 #include <linux/socket.h>
25 #include <linux/sockios.h>
26 #include <linux/errno.h>
27 #include <linux/in.h>
28 #include <linux/inet.h>
29 #include <linux/inetdevice.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_addr.h>
32 #include <linux/if_arp.h>
33 #include <linux/skbuff.h>
34 #include <linux/cache.h>
35 #include <linux/init.h>
36 #include <linux/list.h>
37 #include <linux/slab.h>
38
39 #include <net/ip.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
42 #include <net/tcp.h>
43 #include <net/sock.h>
44 #include <net/arp.h>
45 #include <net/ip_fib.h>
46 #include <net/rtnetlink.h>
47 #include <net/xfrm.h>
48 #include <net/vrf.h>
49
50 #ifndef CONFIG_IP_MULTIPLE_TABLES
51
52 static int __net_init fib4_rules_init(struct net *net)
53 {
54         struct fib_table *local_table, *main_table;
55
56         main_table  = fib_trie_table(RT_TABLE_MAIN, NULL);
57         if (!main_table)
58                 return -ENOMEM;
59
60         local_table = fib_trie_table(RT_TABLE_LOCAL, main_table);
61         if (!local_table)
62                 goto fail;
63
64         hlist_add_head_rcu(&local_table->tb_hlist,
65                                 &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
66         hlist_add_head_rcu(&main_table->tb_hlist,
67                                 &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
68         return 0;
69
70 fail:
71         fib_free_table(main_table);
72         return -ENOMEM;
73 }
74 #else
75
76 struct fib_table *fib_new_table(struct net *net, u32 id)
77 {
78         struct fib_table *tb, *alias = NULL;
79         unsigned int h;
80
81         if (id == 0)
82                 id = RT_TABLE_MAIN;
83         tb = fib_get_table(net, id);
84         if (tb)
85                 return tb;
86
87         if (id == RT_TABLE_LOCAL)
88                 alias = fib_new_table(net, RT_TABLE_MAIN);
89
90         tb = fib_trie_table(id, alias);
91         if (!tb)
92                 return NULL;
93
94         switch (id) {
95         case RT_TABLE_LOCAL:
96                 rcu_assign_pointer(net->ipv4.fib_local, tb);
97                 break;
98         case RT_TABLE_MAIN:
99                 rcu_assign_pointer(net->ipv4.fib_main, tb);
100                 break;
101         case RT_TABLE_DEFAULT:
102                 rcu_assign_pointer(net->ipv4.fib_default, tb);
103                 break;
104         default:
105                 break;
106         }
107
108         h = id & (FIB_TABLE_HASHSZ - 1);
109         hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
110         return tb;
111 }
112
113 /* caller must hold either rtnl or rcu read lock */
114 struct fib_table *fib_get_table(struct net *net, u32 id)
115 {
116         struct fib_table *tb;
117         struct hlist_head *head;
118         unsigned int h;
119
120         if (id == 0)
121                 id = RT_TABLE_MAIN;
122         h = id & (FIB_TABLE_HASHSZ - 1);
123
124         head = &net->ipv4.fib_table_hash[h];
125         hlist_for_each_entry_rcu(tb, head, tb_hlist) {
126                 if (tb->tb_id == id)
127                         return tb;
128         }
129         return NULL;
130 }
131 #endif /* CONFIG_IP_MULTIPLE_TABLES */
132
133 static void fib_replace_table(struct net *net, struct fib_table *old,
134                               struct fib_table *new)
135 {
136 #ifdef CONFIG_IP_MULTIPLE_TABLES
137         switch (new->tb_id) {
138         case RT_TABLE_LOCAL:
139                 rcu_assign_pointer(net->ipv4.fib_local, new);
140                 break;
141         case RT_TABLE_MAIN:
142                 rcu_assign_pointer(net->ipv4.fib_main, new);
143                 break;
144         case RT_TABLE_DEFAULT:
145                 rcu_assign_pointer(net->ipv4.fib_default, new);
146                 break;
147         default:
148                 break;
149         }
150
151 #endif
152         /* replace the old table in the hlist */
153         hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
154 }
155
156 int fib_unmerge(struct net *net)
157 {
158         struct fib_table *old, *new;
159
160         /* attempt to fetch local table if it has been allocated */
161         old = fib_get_table(net, RT_TABLE_LOCAL);
162         if (!old)
163                 return 0;
164
165         new = fib_trie_unmerge(old);
166         if (!new)
167                 return -ENOMEM;
168
169         /* replace merged table with clean table */
170         if (new != old) {
171                 fib_replace_table(net, old, new);
172                 fib_free_table(old);
173         }
174
175         return 0;
176 }
177
178 static void fib_flush(struct net *net)
179 {
180         int flushed = 0;
181         unsigned int h;
182
183         for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
184                 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
185                 struct hlist_node *tmp;
186                 struct fib_table *tb;
187
188                 hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
189                         flushed += fib_table_flush(tb);
190         }
191
192         if (flushed)
193                 rt_cache_flush(net);
194 }
195
196 void fib_flush_external(struct net *net)
197 {
198         struct fib_table *tb;
199         struct hlist_head *head;
200         unsigned int h;
201
202         for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
203                 head = &net->ipv4.fib_table_hash[h];
204                 hlist_for_each_entry(tb, head, tb_hlist)
205                         fib_table_flush_external(tb);
206         }
207 }
208
209 /*
210  * Find address type as if only "dev" was present in the system. If
211  * on_dev is NULL then all interfaces are taken into consideration.
212  */
213 static inline unsigned int __inet_dev_addr_type(struct net *net,
214                                                 const struct net_device *dev,
215                                                 __be32 addr)
216 {
217         struct flowi4           fl4 = { .daddr = addr };
218         struct fib_result       res;
219         unsigned int ret = RTN_BROADCAST;
220         struct fib_table *local_table;
221
222         if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
223                 return RTN_BROADCAST;
224         if (ipv4_is_multicast(addr))
225                 return RTN_MULTICAST;
226
227         rcu_read_lock();
228
229         local_table = fib_get_table(net, RT_TABLE_LOCAL);
230         if (local_table) {
231                 ret = RTN_UNICAST;
232                 if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
233                         if (!dev || dev == res.fi->fib_dev)
234                                 ret = res.type;
235                 }
236         }
237
238         rcu_read_unlock();
239         return ret;
240 }
241
242 unsigned int inet_addr_type(struct net *net, __be32 addr)
243 {
244         return __inet_dev_addr_type(net, NULL, addr);
245 }
246 EXPORT_SYMBOL(inet_addr_type);
247
248 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
249                                 __be32 addr)
250 {
251         return __inet_dev_addr_type(net, dev, addr);
252 }
253 EXPORT_SYMBOL(inet_dev_addr_type);
254
255 __be32 fib_compute_spec_dst(struct sk_buff *skb)
256 {
257         struct net_device *dev = skb->dev;
258         struct in_device *in_dev;
259         struct fib_result res;
260         struct rtable *rt;
261         struct flowi4 fl4;
262         struct net *net;
263         int scope;
264
265         rt = skb_rtable(skb);
266         if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
267             RTCF_LOCAL)
268                 return ip_hdr(skb)->daddr;
269
270         in_dev = __in_dev_get_rcu(dev);
271         BUG_ON(!in_dev);
272
273         net = dev_net(dev);
274
275         scope = RT_SCOPE_UNIVERSE;
276         if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
277                 fl4.flowi4_oif = 0;
278                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
279                 fl4.daddr = ip_hdr(skb)->saddr;
280                 fl4.saddr = 0;
281                 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
282                 fl4.flowi4_scope = scope;
283                 fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
284                 fl4.flowi4_tun_key.tun_id = 0;
285                 if (!fib_lookup(net, &fl4, &res, 0))
286                         return FIB_RES_PREFSRC(net, res);
287         } else {
288                 scope = RT_SCOPE_LINK;
289         }
290
291         return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
292 }
293
294 /* Given (packet source, input interface) and optional (dst, oif, tos):
295  * - (main) check, that source is valid i.e. not broadcast or our local
296  *   address.
297  * - figure out what "logical" interface this packet arrived
298  *   and calculate "specific destination" address.
299  * - check, that packet arrived from expected physical interface.
300  * called with rcu_read_lock()
301  */
302 static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
303                                  u8 tos, int oif, struct net_device *dev,
304                                  int rpf, struct in_device *idev, u32 *itag)
305 {
306         int ret, no_addr;
307         struct fib_result res;
308         struct flowi4 fl4;
309         struct net *net;
310         bool dev_match;
311
312         fl4.flowi4_oif = 0;
313         fl4.flowi4_iif = vrf_master_ifindex_rcu(dev);
314         if (!fl4.flowi4_iif)
315                 fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
316         fl4.daddr = src;
317         fl4.saddr = dst;
318         fl4.flowi4_tos = tos;
319         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
320         fl4.flowi4_tun_key.tun_id = 0;
321
322         no_addr = idev->ifa_list == NULL;
323
324         fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
325
326         net = dev_net(dev);
327         if (fib_lookup(net, &fl4, &res, 0))
328                 goto last_resort;
329         if (res.type != RTN_UNICAST &&
330             (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
331                 goto e_inval;
332         if (!rpf && !fib_num_tclassid_users(dev_net(dev)) &&
333             (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
334                 goto last_resort;
335         fib_combine_itag(itag, &res);
336         dev_match = false;
337
338 #ifdef CONFIG_IP_ROUTE_MULTIPATH
339         for (ret = 0; ret < res.fi->fib_nhs; ret++) {
340                 struct fib_nh *nh = &res.fi->fib_nh[ret];
341
342                 if (nh->nh_dev == dev) {
343                         dev_match = true;
344                         break;
345                 } else if (vrf_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
346                         dev_match = true;
347                         break;
348                 }
349         }
350 #else
351         if (FIB_RES_DEV(res) == dev)
352                 dev_match = true;
353 #endif
354         if (dev_match) {
355                 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
356                 return ret;
357         }
358         if (no_addr)
359                 goto last_resort;
360         if (rpf == 1)
361                 goto e_rpf;
362         fl4.flowi4_oif = dev->ifindex;
363
364         ret = 0;
365         if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
366                 if (res.type == RTN_UNICAST)
367                         ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
368         }
369         return ret;
370
371 last_resort:
372         if (rpf)
373                 goto e_rpf;
374         *itag = 0;
375         return 0;
376
377 e_inval:
378         return -EINVAL;
379 e_rpf:
380         return -EXDEV;
381 }
382
383 /* Ignore rp_filter for packets protected by IPsec. */
384 int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
385                         u8 tos, int oif, struct net_device *dev,
386                         struct in_device *idev, u32 *itag)
387 {
388         int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
389
390         if (!r && !fib_num_tclassid_users(dev_net(dev)) &&
391             IN_DEV_ACCEPT_LOCAL(idev) &&
392             (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
393                 *itag = 0;
394                 return 0;
395         }
396         return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
397 }
398
399 static inline __be32 sk_extract_addr(struct sockaddr *addr)
400 {
401         return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
402 }
403
404 static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
405 {
406         struct nlattr *nla;
407
408         nla = (struct nlattr *) ((char *) mx + len);
409         nla->nla_type = type;
410         nla->nla_len = nla_attr_size(4);
411         *(u32 *) nla_data(nla) = value;
412
413         return len + nla_total_size(4);
414 }
415
416 static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
417                                  struct fib_config *cfg)
418 {
419         __be32 addr;
420         int plen;
421
422         memset(cfg, 0, sizeof(*cfg));
423         cfg->fc_nlinfo.nl_net = net;
424
425         if (rt->rt_dst.sa_family != AF_INET)
426                 return -EAFNOSUPPORT;
427
428         /*
429          * Check mask for validity:
430          * a) it must be contiguous.
431          * b) destination must have all host bits clear.
432          * c) if application forgot to set correct family (AF_INET),
433          *    reject request unless it is absolutely clear i.e.
434          *    both family and mask are zero.
435          */
436         plen = 32;
437         addr = sk_extract_addr(&rt->rt_dst);
438         if (!(rt->rt_flags & RTF_HOST)) {
439                 __be32 mask = sk_extract_addr(&rt->rt_genmask);
440
441                 if (rt->rt_genmask.sa_family != AF_INET) {
442                         if (mask || rt->rt_genmask.sa_family)
443                                 return -EAFNOSUPPORT;
444                 }
445
446                 if (bad_mask(mask, addr))
447                         return -EINVAL;
448
449                 plen = inet_mask_len(mask);
450         }
451
452         cfg->fc_dst_len = plen;
453         cfg->fc_dst = addr;
454
455         if (cmd != SIOCDELRT) {
456                 cfg->fc_nlflags = NLM_F_CREATE;
457                 cfg->fc_protocol = RTPROT_BOOT;
458         }
459
460         if (rt->rt_metric)
461                 cfg->fc_priority = rt->rt_metric - 1;
462
463         if (rt->rt_flags & RTF_REJECT) {
464                 cfg->fc_scope = RT_SCOPE_HOST;
465                 cfg->fc_type = RTN_UNREACHABLE;
466                 return 0;
467         }
468
469         cfg->fc_scope = RT_SCOPE_NOWHERE;
470         cfg->fc_type = RTN_UNICAST;
471
472         if (rt->rt_dev) {
473                 char *colon;
474                 struct net_device *dev;
475                 char devname[IFNAMSIZ];
476
477                 if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
478                         return -EFAULT;
479
480                 devname[IFNAMSIZ-1] = 0;
481                 colon = strchr(devname, ':');
482                 if (colon)
483                         *colon = 0;
484                 dev = __dev_get_by_name(net, devname);
485                 if (!dev)
486                         return -ENODEV;
487                 cfg->fc_oif = dev->ifindex;
488                 if (colon) {
489                         struct in_ifaddr *ifa;
490                         struct in_device *in_dev = __in_dev_get_rtnl(dev);
491                         if (!in_dev)
492                                 return -ENODEV;
493                         *colon = ':';
494                         for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
495                                 if (strcmp(ifa->ifa_label, devname) == 0)
496                                         break;
497                         if (!ifa)
498                                 return -ENODEV;
499                         cfg->fc_prefsrc = ifa->ifa_local;
500                 }
501         }
502
503         addr = sk_extract_addr(&rt->rt_gateway);
504         if (rt->rt_gateway.sa_family == AF_INET && addr) {
505                 cfg->fc_gw = addr;
506                 if (rt->rt_flags & RTF_GATEWAY &&
507                     inet_addr_type(net, addr) == RTN_UNICAST)
508                         cfg->fc_scope = RT_SCOPE_UNIVERSE;
509         }
510
511         if (cmd == SIOCDELRT)
512                 return 0;
513
514         if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
515                 return -EINVAL;
516
517         if (cfg->fc_scope == RT_SCOPE_NOWHERE)
518                 cfg->fc_scope = RT_SCOPE_LINK;
519
520         if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
521                 struct nlattr *mx;
522                 int len = 0;
523
524                 mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
525                 if (!mx)
526                         return -ENOMEM;
527
528                 if (rt->rt_flags & RTF_MTU)
529                         len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
530
531                 if (rt->rt_flags & RTF_WINDOW)
532                         len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
533
534                 if (rt->rt_flags & RTF_IRTT)
535                         len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
536
537                 cfg->fc_mx = mx;
538                 cfg->fc_mx_len = len;
539         }
540
541         return 0;
542 }
543
544 /*
545  * Handle IP routing ioctl calls.
546  * These are used to manipulate the routing tables
547  */
548 int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
549 {
550         struct fib_config cfg;
551         struct rtentry rt;
552         int err;
553
554         switch (cmd) {
555         case SIOCADDRT:         /* Add a route */
556         case SIOCDELRT:         /* Delete a route */
557                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
558                         return -EPERM;
559
560                 if (copy_from_user(&rt, arg, sizeof(rt)))
561                         return -EFAULT;
562
563                 rtnl_lock();
564                 err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
565                 if (err == 0) {
566                         struct fib_table *tb;
567
568                         if (cmd == SIOCDELRT) {
569                                 tb = fib_get_table(net, cfg.fc_table);
570                                 if (tb)
571                                         err = fib_table_delete(tb, &cfg);
572                                 else
573                                         err = -ESRCH;
574                         } else {
575                                 tb = fib_new_table(net, cfg.fc_table);
576                                 if (tb)
577                                         err = fib_table_insert(tb, &cfg);
578                                 else
579                                         err = -ENOBUFS;
580                         }
581
582                         /* allocated by rtentry_to_fib_config() */
583                         kfree(cfg.fc_mx);
584                 }
585                 rtnl_unlock();
586                 return err;
587         }
588         return -EINVAL;
589 }
590
591 const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
592         [RTA_DST]               = { .type = NLA_U32 },
593         [RTA_SRC]               = { .type = NLA_U32 },
594         [RTA_IIF]               = { .type = NLA_U32 },
595         [RTA_OIF]               = { .type = NLA_U32 },
596         [RTA_GATEWAY]           = { .type = NLA_U32 },
597         [RTA_PRIORITY]          = { .type = NLA_U32 },
598         [RTA_PREFSRC]           = { .type = NLA_U32 },
599         [RTA_METRICS]           = { .type = NLA_NESTED },
600         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
601         [RTA_FLOW]              = { .type = NLA_U32 },
602         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
603         [RTA_ENCAP]             = { .type = NLA_NESTED },
604 };
605
606 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
607                              struct nlmsghdr *nlh, struct fib_config *cfg)
608 {
609         struct nlattr *attr;
610         int err, remaining;
611         struct rtmsg *rtm;
612
613         err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
614         if (err < 0)
615                 goto errout;
616
617         memset(cfg, 0, sizeof(*cfg));
618
619         rtm = nlmsg_data(nlh);
620         cfg->fc_dst_len = rtm->rtm_dst_len;
621         cfg->fc_tos = rtm->rtm_tos;
622         cfg->fc_table = rtm->rtm_table;
623         cfg->fc_protocol = rtm->rtm_protocol;
624         cfg->fc_scope = rtm->rtm_scope;
625         cfg->fc_type = rtm->rtm_type;
626         cfg->fc_flags = rtm->rtm_flags;
627         cfg->fc_nlflags = nlh->nlmsg_flags;
628
629         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
630         cfg->fc_nlinfo.nlh = nlh;
631         cfg->fc_nlinfo.nl_net = net;
632
633         if (cfg->fc_type > RTN_MAX) {
634                 err = -EINVAL;
635                 goto errout;
636         }
637
638         nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
639                 switch (nla_type(attr)) {
640                 case RTA_DST:
641                         cfg->fc_dst = nla_get_be32(attr);
642                         break;
643                 case RTA_OIF:
644                         cfg->fc_oif = nla_get_u32(attr);
645                         break;
646                 case RTA_GATEWAY:
647                         cfg->fc_gw = nla_get_be32(attr);
648                         break;
649                 case RTA_PRIORITY:
650                         cfg->fc_priority = nla_get_u32(attr);
651                         break;
652                 case RTA_PREFSRC:
653                         cfg->fc_prefsrc = nla_get_be32(attr);
654                         break;
655                 case RTA_METRICS:
656                         cfg->fc_mx = nla_data(attr);
657                         cfg->fc_mx_len = nla_len(attr);
658                         break;
659                 case RTA_MULTIPATH:
660                         cfg->fc_mp = nla_data(attr);
661                         cfg->fc_mp_len = nla_len(attr);
662                         break;
663                 case RTA_FLOW:
664                         cfg->fc_flow = nla_get_u32(attr);
665                         break;
666                 case RTA_TABLE:
667                         cfg->fc_table = nla_get_u32(attr);
668                         break;
669                 case RTA_ENCAP:
670                         cfg->fc_encap = attr;
671                         break;
672                 case RTA_ENCAP_TYPE:
673                         cfg->fc_encap_type = nla_get_u16(attr);
674                         break;
675                 }
676         }
677
678         return 0;
679 errout:
680         return err;
681 }
682
683 static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
684 {
685         struct net *net = sock_net(skb->sk);
686         struct fib_config cfg;
687         struct fib_table *tb;
688         int err;
689
690         err = rtm_to_fib_config(net, skb, nlh, &cfg);
691         if (err < 0)
692                 goto errout;
693
694         tb = fib_get_table(net, cfg.fc_table);
695         if (!tb) {
696                 err = -ESRCH;
697                 goto errout;
698         }
699
700         err = fib_table_delete(tb, &cfg);
701 errout:
702         return err;
703 }
704
705 static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
706 {
707         struct net *net = sock_net(skb->sk);
708         struct fib_config cfg;
709         struct fib_table *tb;
710         int err;
711
712         err = rtm_to_fib_config(net, skb, nlh, &cfg);
713         if (err < 0)
714                 goto errout;
715
716         tb = fib_new_table(net, cfg.fc_table);
717         if (!tb) {
718                 err = -ENOBUFS;
719                 goto errout;
720         }
721
722         err = fib_table_insert(tb, &cfg);
723 errout:
724         return err;
725 }
726
727 static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
728 {
729         struct net *net = sock_net(skb->sk);
730         unsigned int h, s_h;
731         unsigned int e = 0, s_e;
732         struct fib_table *tb;
733         struct hlist_head *head;
734         int dumped = 0;
735
736         if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
737             ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
738                 return skb->len;
739
740         s_h = cb->args[0];
741         s_e = cb->args[1];
742
743         rcu_read_lock();
744
745         for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
746                 e = 0;
747                 head = &net->ipv4.fib_table_hash[h];
748                 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
749                         if (e < s_e)
750                                 goto next;
751                         if (dumped)
752                                 memset(&cb->args[2], 0, sizeof(cb->args) -
753                                                  2 * sizeof(cb->args[0]));
754                         if (fib_table_dump(tb, skb, cb) < 0)
755                                 goto out;
756                         dumped = 1;
757 next:
758                         e++;
759                 }
760         }
761 out:
762         rcu_read_unlock();
763
764         cb->args[1] = e;
765         cb->args[0] = h;
766
767         return skb->len;
768 }
769
770 /* Prepare and feed intra-kernel routing request.
771  * Really, it should be netlink message, but :-( netlink
772  * can be not configured, so that we feed it directly
773  * to fib engine. It is legal, because all events occur
774  * only when netlink is already locked.
775  */
776 static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
777 {
778         struct net *net = dev_net(ifa->ifa_dev->dev);
779         struct fib_table *tb;
780         struct fib_config cfg = {
781                 .fc_protocol = RTPROT_KERNEL,
782                 .fc_type = type,
783                 .fc_dst = dst,
784                 .fc_dst_len = dst_len,
785                 .fc_prefsrc = ifa->ifa_local,
786                 .fc_oif = ifa->ifa_dev->dev->ifindex,
787                 .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
788                 .fc_nlinfo = {
789                         .nl_net = net,
790                 },
791         };
792
793         if (type == RTN_UNICAST)
794                 tb = fib_new_table(net, RT_TABLE_MAIN);
795         else
796                 tb = fib_new_table(net, RT_TABLE_LOCAL);
797
798         if (!tb)
799                 return;
800
801         cfg.fc_table = tb->tb_id;
802
803         if (type != RTN_LOCAL)
804                 cfg.fc_scope = RT_SCOPE_LINK;
805         else
806                 cfg.fc_scope = RT_SCOPE_HOST;
807
808         if (cmd == RTM_NEWROUTE)
809                 fib_table_insert(tb, &cfg);
810         else
811                 fib_table_delete(tb, &cfg);
812 }
813
814 void fib_add_ifaddr(struct in_ifaddr *ifa)
815 {
816         struct in_device *in_dev = ifa->ifa_dev;
817         struct net_device *dev = in_dev->dev;
818         struct in_ifaddr *prim = ifa;
819         __be32 mask = ifa->ifa_mask;
820         __be32 addr = ifa->ifa_local;
821         __be32 prefix = ifa->ifa_address & mask;
822
823         if (ifa->ifa_flags & IFA_F_SECONDARY) {
824                 prim = inet_ifa_byprefix(in_dev, prefix, mask);
825                 if (!prim) {
826                         pr_warn("%s: bug: prim == NULL\n", __func__);
827                         return;
828                 }
829         }
830
831         fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
832
833         if (!(dev->flags & IFF_UP))
834                 return;
835
836         /* Add broadcast address, if it is explicitly assigned. */
837         if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
838                 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
839
840         if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
841             (prefix != addr || ifa->ifa_prefixlen < 32)) {
842                 fib_magic(RTM_NEWROUTE,
843                           dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
844                           prefix, ifa->ifa_prefixlen, prim);
845
846                 /* Add network specific broadcasts, when it takes a sense */
847                 if (ifa->ifa_prefixlen < 31) {
848                         fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
849                         fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
850                                   32, prim);
851                 }
852         }
853 }
854
855 /* Delete primary or secondary address.
856  * Optionally, on secondary address promotion consider the addresses
857  * from subnet iprim as deleted, even if they are in device list.
858  * In this case the secondary ifa can be in device list.
859  */
860 void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
861 {
862         struct in_device *in_dev = ifa->ifa_dev;
863         struct net_device *dev = in_dev->dev;
864         struct in_ifaddr *ifa1;
865         struct in_ifaddr *prim = ifa, *prim1 = NULL;
866         __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
867         __be32 any = ifa->ifa_address & ifa->ifa_mask;
868 #define LOCAL_OK        1
869 #define BRD_OK          2
870 #define BRD0_OK         4
871 #define BRD1_OK         8
872         unsigned int ok = 0;
873         int subnet = 0;         /* Primary network */
874         int gone = 1;           /* Address is missing */
875         int same_prefsrc = 0;   /* Another primary with same IP */
876
877         if (ifa->ifa_flags & IFA_F_SECONDARY) {
878                 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
879                 if (!prim) {
880                         pr_warn("%s: bug: prim == NULL\n", __func__);
881                         return;
882                 }
883                 if (iprim && iprim != prim) {
884                         pr_warn("%s: bug: iprim != prim\n", __func__);
885                         return;
886                 }
887         } else if (!ipv4_is_zeronet(any) &&
888                    (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
889                 fib_magic(RTM_DELROUTE,
890                           dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
891                           any, ifa->ifa_prefixlen, prim);
892                 subnet = 1;
893         }
894
895         /* Deletion is more complicated than add.
896          * We should take care of not to delete too much :-)
897          *
898          * Scan address list to be sure that addresses are really gone.
899          */
900
901         for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
902                 if (ifa1 == ifa) {
903                         /* promotion, keep the IP */
904                         gone = 0;
905                         continue;
906                 }
907                 /* Ignore IFAs from our subnet */
908                 if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
909                     inet_ifa_match(ifa1->ifa_address, iprim))
910                         continue;
911
912                 /* Ignore ifa1 if it uses different primary IP (prefsrc) */
913                 if (ifa1->ifa_flags & IFA_F_SECONDARY) {
914                         /* Another address from our subnet? */
915                         if (ifa1->ifa_mask == prim->ifa_mask &&
916                             inet_ifa_match(ifa1->ifa_address, prim))
917                                 prim1 = prim;
918                         else {
919                                 /* We reached the secondaries, so
920                                  * same_prefsrc should be determined.
921                                  */
922                                 if (!same_prefsrc)
923                                         continue;
924                                 /* Search new prim1 if ifa1 is not
925                                  * using the current prim1
926                                  */
927                                 if (!prim1 ||
928                                     ifa1->ifa_mask != prim1->ifa_mask ||
929                                     !inet_ifa_match(ifa1->ifa_address, prim1))
930                                         prim1 = inet_ifa_byprefix(in_dev,
931                                                         ifa1->ifa_address,
932                                                         ifa1->ifa_mask);
933                                 if (!prim1)
934                                         continue;
935                                 if (prim1->ifa_local != prim->ifa_local)
936                                         continue;
937                         }
938                 } else {
939                         if (prim->ifa_local != ifa1->ifa_local)
940                                 continue;
941                         prim1 = ifa1;
942                         if (prim != prim1)
943                                 same_prefsrc = 1;
944                 }
945                 if (ifa->ifa_local == ifa1->ifa_local)
946                         ok |= LOCAL_OK;
947                 if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
948                         ok |= BRD_OK;
949                 if (brd == ifa1->ifa_broadcast)
950                         ok |= BRD1_OK;
951                 if (any == ifa1->ifa_broadcast)
952                         ok |= BRD0_OK;
953                 /* primary has network specific broadcasts */
954                 if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
955                         __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
956                         __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
957
958                         if (!ipv4_is_zeronet(any1)) {
959                                 if (ifa->ifa_broadcast == brd1 ||
960                                     ifa->ifa_broadcast == any1)
961                                         ok |= BRD_OK;
962                                 if (brd == brd1 || brd == any1)
963                                         ok |= BRD1_OK;
964                                 if (any == brd1 || any == any1)
965                                         ok |= BRD0_OK;
966                         }
967                 }
968         }
969
970         if (!(ok & BRD_OK))
971                 fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
972         if (subnet && ifa->ifa_prefixlen < 31) {
973                 if (!(ok & BRD1_OK))
974                         fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
975                 if (!(ok & BRD0_OK))
976                         fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
977         }
978         if (!(ok & LOCAL_OK)) {
979                 fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
980
981                 /* Check, that this local address finally disappeared. */
982                 if (gone &&
983                     inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
984                         /* And the last, but not the least thing.
985                          * We must flush stray FIB entries.
986                          *
987                          * First of all, we scan fib_info list searching
988                          * for stray nexthop entries, then ignite fib_flush.
989                          */
990                         if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
991                                 fib_flush(dev_net(dev));
992                 }
993         }
994 #undef LOCAL_OK
995 #undef BRD_OK
996 #undef BRD0_OK
997 #undef BRD1_OK
998 }
999
1000 static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
1001 {
1002
1003         struct fib_result       res;
1004         struct flowi4           fl4 = {
1005                 .flowi4_mark = frn->fl_mark,
1006                 .daddr = frn->fl_addr,
1007                 .flowi4_tos = frn->fl_tos,
1008                 .flowi4_scope = frn->fl_scope,
1009         };
1010         struct fib_table *tb;
1011
1012         rcu_read_lock();
1013
1014         tb = fib_get_table(net, frn->tb_id_in);
1015
1016         frn->err = -ENOENT;
1017         if (tb) {
1018                 local_bh_disable();
1019
1020                 frn->tb_id = tb->tb_id;
1021                 frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
1022
1023                 if (!frn->err) {
1024                         frn->prefixlen = res.prefixlen;
1025                         frn->nh_sel = res.nh_sel;
1026                         frn->type = res.type;
1027                         frn->scope = res.scope;
1028                 }
1029                 local_bh_enable();
1030         }
1031
1032         rcu_read_unlock();
1033 }
1034
1035 static void nl_fib_input(struct sk_buff *skb)
1036 {
1037         struct net *net;
1038         struct fib_result_nl *frn;
1039         struct nlmsghdr *nlh;
1040         u32 portid;
1041
1042         net = sock_net(skb->sk);
1043         nlh = nlmsg_hdr(skb);
1044         if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len ||
1045             nlmsg_len(nlh) < sizeof(*frn))
1046                 return;
1047
1048         skb = netlink_skb_clone(skb, GFP_KERNEL);
1049         if (!skb)
1050                 return;
1051         nlh = nlmsg_hdr(skb);
1052
1053         frn = (struct fib_result_nl *) nlmsg_data(nlh);
1054         nl_fib_lookup(net, frn);
1055
1056         portid = NETLINK_CB(skb).portid;      /* netlink portid */
1057         NETLINK_CB(skb).portid = 0;        /* from kernel */
1058         NETLINK_CB(skb).dst_group = 0;  /* unicast */
1059         netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
1060 }
1061
1062 static int __net_init nl_fib_lookup_init(struct net *net)
1063 {
1064         struct sock *sk;
1065         struct netlink_kernel_cfg cfg = {
1066                 .input  = nl_fib_input,
1067         };
1068
1069         sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
1070         if (!sk)
1071                 return -EAFNOSUPPORT;
1072         net->ipv4.fibnl = sk;
1073         return 0;
1074 }
1075
1076 static void nl_fib_lookup_exit(struct net *net)
1077 {
1078         netlink_kernel_release(net->ipv4.fibnl);
1079         net->ipv4.fibnl = NULL;
1080 }
1081
/* Take IPv4 routing down for @dev in response to @event.
 *
 * fib_flush() is run only when fib_sync_down_dev() returns non-zero; the
 * route cache flush and arp_ifdown() are always performed.
 */
static void fib_disable_ip(struct net_device *dev, unsigned long event)
{
	struct net *net = dev_net(dev);

	if (fib_sync_down_dev(dev, event) != 0)
		fib_flush(net);

	rt_cache_flush(net);
	arp_ifdown(dev);
}
1089
1090 static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
1091 {
1092         struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
1093         struct net_device *dev = ifa->ifa_dev->dev;
1094         struct net *net = dev_net(dev);
1095
1096         switch (event) {
1097         case NETDEV_UP:
1098                 fib_add_ifaddr(ifa);
1099 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1100                 fib_sync_up(dev, RTNH_F_DEAD);
1101 #endif
1102                 atomic_inc(&net->ipv4.dev_addr_genid);
1103                 rt_cache_flush(dev_net(dev));
1104                 break;
1105         case NETDEV_DOWN:
1106                 fib_del_ifaddr(ifa, NULL);
1107                 atomic_inc(&net->ipv4.dev_addr_genid);
1108                 if (!ifa->ifa_dev->ifa_list) {
1109                         /* Last address was deleted from this interface.
1110                          * Disable IP.
1111                          */
1112                         fib_disable_ip(dev, event);
1113                 } else {
1114                         rt_cache_flush(dev_net(dev));
1115                 }
1116                 break;
1117         }
1118         return NOTIFY_DONE;
1119 }
1120
1121 static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
1122 {
1123         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1124         struct in_device *in_dev;
1125         struct net *net = dev_net(dev);
1126         unsigned int flags;
1127
1128         if (event == NETDEV_UNREGISTER) {
1129                 fib_disable_ip(dev, event);
1130                 rt_flush_dev(dev);
1131                 return NOTIFY_DONE;
1132         }
1133
1134         in_dev = __in_dev_get_rtnl(dev);
1135         if (!in_dev)
1136                 return NOTIFY_DONE;
1137
1138         switch (event) {
1139         case NETDEV_UP:
1140                 for_ifa(in_dev) {
1141                         fib_add_ifaddr(ifa);
1142                 } endfor_ifa(in_dev);
1143 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1144                 fib_sync_up(dev, RTNH_F_DEAD);
1145 #endif
1146                 atomic_inc(&net->ipv4.dev_addr_genid);
1147                 rt_cache_flush(net);
1148                 break;
1149         case NETDEV_DOWN:
1150                 fib_disable_ip(dev, event);
1151                 break;
1152         case NETDEV_CHANGE:
1153                 flags = dev_get_flags(dev);
1154                 if (flags & (IFF_RUNNING | IFF_LOWER_UP))
1155                         fib_sync_up(dev, RTNH_F_LINKDOWN);
1156                 else
1157                         fib_sync_down_dev(dev, event);
1158                 /* fall through */
1159         case NETDEV_CHANGEMTU:
1160                 rt_cache_flush(net);
1161                 break;
1162         }
1163         return NOTIFY_DONE;
1164 }
1165
1166 static struct notifier_block fib_inetaddr_notifier = {
1167         .notifier_call = fib_inetaddr_event,
1168 };
1169
1170 static struct notifier_block fib_netdev_notifier = {
1171         .notifier_call = fib_netdev_event,
1172 };
1173
1174 static int __net_init ip_fib_net_init(struct net *net)
1175 {
1176         int err;
1177         size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1178
1179         /* Avoid false sharing : Use at least a full cache line */
1180         size = max_t(size_t, size, L1_CACHE_BYTES);
1181
1182         net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
1183         if (!net->ipv4.fib_table_hash)
1184                 return -ENOMEM;
1185
1186         err = fib4_rules_init(net);
1187         if (err < 0)
1188                 goto fail;
1189         return 0;
1190
1191 fail:
1192         kfree(net->ipv4.fib_table_hash);
1193         return err;
1194 }
1195
1196 static void ip_fib_net_exit(struct net *net)
1197 {
1198         unsigned int i;
1199
1200         rtnl_lock();
1201 #ifdef CONFIG_IP_MULTIPLE_TABLES
1202         RCU_INIT_POINTER(net->ipv4.fib_local, NULL);
1203         RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
1204         RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
1205 #endif
1206         for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
1207                 struct hlist_head *head = &net->ipv4.fib_table_hash[i];
1208                 struct hlist_node *tmp;
1209                 struct fib_table *tb;
1210
1211                 hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
1212                         hlist_del(&tb->tb_hlist);
1213                         fib_table_flush(tb);
1214                         fib_free_table(tb);
1215                 }
1216         }
1217
1218 #ifdef CONFIG_IP_MULTIPLE_TABLES
1219         fib4_rules_exit(net);
1220 #endif
1221         rtnl_unlock();
1222         kfree(net->ipv4.fib_table_hash);
1223 }
1224
1225 static int __net_init fib_net_init(struct net *net)
1226 {
1227         int error;
1228
1229 #ifdef CONFIG_IP_ROUTE_CLASSID
1230         net->ipv4.fib_num_tclassid_users = 0;
1231 #endif
1232         error = ip_fib_net_init(net);
1233         if (error < 0)
1234                 goto out;
1235         error = nl_fib_lookup_init(net);
1236         if (error < 0)
1237                 goto out_nlfl;
1238         error = fib_proc_init(net);
1239         if (error < 0)
1240                 goto out_proc;
1241 out:
1242         return error;
1243
1244 out_proc:
1245         nl_fib_lookup_exit(net);
1246 out_nlfl:
1247         ip_fib_net_exit(net);
1248         goto out;
1249 }
1250
1251 static void __net_exit fib_net_exit(struct net *net)
1252 {
1253         fib_proc_exit(net);
1254         nl_fib_lookup_exit(net);
1255         ip_fib_net_exit(net);
1256 }
1257
1258 static struct pernet_operations fib_net_ops = {
1259         .init = fib_net_init,
1260         .exit = fib_net_exit,
1261 };
1262
1263 void __init ip_fib_init(void)
1264 {
1265         rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
1266         rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
1267         rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
1268
1269         register_pernet_subsys(&fib_net_ops);
1270         register_netdevice_notifier(&fib_netdev_notifier);
1271         register_inetaddr_notifier(&fib_inetaddr_notifier);
1272
1273         fib_trie_init();
1274 }