ipmr: RCU conversion of mroute_sk
net/ipv4/ipmr.c
/*
 *      IP multicast routing support for mrouted 3.6/3.8
 *
 *              (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *        Linux Consultancy and Custom Driver Development
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Fixes:
 *      Michael Chastain        :       Incorrect size of copying.
 *      Alan Cox                :       Added the cache manager code
 *      Alan Cox                :       Fixed the clone/copy bug and device race.
 *      Mike McLagan            :       Routing by source
 *      Malcolm Beattie         :       Buffer handling fixes.
 *      Alexey Kuznetsov        :       Double buffer free and other fixes.
 *      SVR Anand               :       Fixed several multicast bugs and problems.
 *      Alexey Kuznetsov        :       Status, optimisations and more.
 *      Brad Parker             :       Better behaviour on mrouted upcall
 *                                      overflow.
 *      Carlos Picoto           :       PIMv1 Support
 *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
 *                                      Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/netlink.h>
#include <net/fib_rules.h>

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif

struct mr_table {
        struct list_head        list;
#ifdef CONFIG_NET_NS
        struct net              *net;
#endif
        u32                     id;
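        /* mrouted control socket: written only under RTNL protection,
         * using rcu_assign_pointer(); read in the upcall path under
         * rcu_read_lock() (see ipmr_cache_report()).
         */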
        struct sock __rcu       *mroute_sk;
        struct timer_list       ipmr_expire_timer;
        struct list_head        mfc_unres_queue;
        struct list_head        mfc_cache_array[MFC_LINES];
        struct vif_device       vif_table[MAXVIFS];
        int                     maxvif;
        atomic_t                cache_resolve_queue_len;
        int                     mroute_do_assert;
        int                     mroute_do_pim;
#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
        int                     mroute_reg_vif_num;
#endif
};

struct ipmr_rule {
        struct fib_rule         common;
};

struct ipmr_result {
        struct mr_table         *mrt;
};

/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that the changes are serialized via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *      Multicast router control variables
 */

#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and protected
   with the weak lock mrt_lock. The queue of unresolved entries is
   protected with the strong spinlock mfc_unres_lock.

   This way the data path is entirely free of exclusive locks.
 */

static struct kmem_cache *mrt_cachep __read_mostly;

static struct mr_table *ipmr_new_table(struct net *net, u32 id);
static int ip_mr_forward(struct net *net, struct mr_table *mrt,
                         struct sk_buff *skb, struct mfc_cache *cache,
                         int local);
static int ipmr_cache_report(struct mr_table *mrt,
                             struct sk_buff *pkt, vifi_t vifi, int assert);
static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
                              struct mfc_cache *c, struct rtmsg *rtm);
static void ipmr_expire_process(unsigned long arg);

#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
#define ipmr_for_each_table(mrt, net) \
        list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)

static struct mr_table *ipmr_get_table(struct net *net, u32 id)
{
        struct mr_table *mrt;

        ipmr_for_each_table(mrt, net) {
                if (mrt->id == id)
                        return mrt;
        }
        return NULL;
}

static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
                           struct mr_table **mrt)
{
        struct ipmr_result res;
        struct fib_lookup_arg arg = { .result = &res, };
        int err;

        err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg);
        if (err < 0)
                return err;
        *mrt = res.mrt;
        return 0;
}

static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
                            int flags, struct fib_lookup_arg *arg)
{
        struct ipmr_result *res = arg->result;
        struct mr_table *mrt;

        switch (rule->action) {
        case FR_ACT_TO_TBL:
                break;
        case FR_ACT_UNREACHABLE:
                return -ENETUNREACH;
        case FR_ACT_PROHIBIT:
                return -EACCES;
        case FR_ACT_BLACKHOLE:
        default:
                return -EINVAL;
        }

        mrt = ipmr_get_table(rule->fr_net, rule->table);
        if (mrt == NULL)
                return -EAGAIN;
        res->mrt = mrt;
        return 0;
}

static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
{
        return 1;
}

static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
        FRA_GENERIC_POLICY,
};

static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
                               struct fib_rule_hdr *frh, struct nlattr **tb)
{
        return 0;
}

static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
                             struct nlattr **tb)
{
        return 1;
}

static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
                          struct fib_rule_hdr *frh)
{
        frh->dst_len = 0;
        frh->src_len = 0;
        frh->tos     = 0;
        return 0;
}

static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = {
        .family         = RTNL_FAMILY_IPMR,
        .rule_size      = sizeof(struct ipmr_rule),
        .addr_size      = sizeof(u32),
        .action         = ipmr_rule_action,
        .match          = ipmr_rule_match,
        .configure      = ipmr_rule_configure,
        .compare        = ipmr_rule_compare,
        .default_pref   = fib_default_rule_pref,
        .fill           = ipmr_rule_fill,
        .nlgroup        = RTNLGRP_IPV4_RULE,
        .policy         = ipmr_rule_policy,
        .owner          = THIS_MODULE,
};

static int __net_init ipmr_rules_init(struct net *net)
{
        struct fib_rules_ops *ops;
        struct mr_table *mrt;
        int err;

        ops = fib_rules_register(&ipmr_rules_ops_template, net);
        if (IS_ERR(ops))
                return PTR_ERR(ops);

        INIT_LIST_HEAD(&net->ipv4.mr_tables);

        mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
        if (mrt == NULL) {
                err = -ENOMEM;
                goto err1;
        }

        err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
        if (err < 0)
                goto err2;

        net->ipv4.mr_rules_ops = ops;
        return 0;

err2:
        kfree(mrt);
err1:
        fib_rules_unregister(ops);
        return err;
}

static void __net_exit ipmr_rules_exit(struct net *net)
{
        struct mr_table *mrt, *next;

        list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
                list_del(&mrt->list);
                kfree(mrt);
        }
        fib_rules_unregister(net->ipv4.mr_rules_ops);
}
#else
#define ipmr_for_each_table(mrt, net) \
        for (mrt = net->ipv4.mrt; mrt; mrt = NULL)

static struct mr_table *ipmr_get_table(struct net *net, u32 id)
{
        return net->ipv4.mrt;
}

static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
                           struct mr_table **mrt)
{
        *mrt = net->ipv4.mrt;
        return 0;
}

static int __net_init ipmr_rules_init(struct net *net)
{
        net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
        return net->ipv4.mrt ? 0 : -ENOMEM;
}

static void __net_exit ipmr_rules_exit(struct net *net)
{
        kfree(net->ipv4.mrt);
}
#endif

static struct mr_table *ipmr_new_table(struct net *net, u32 id)
{
        struct mr_table *mrt;
        unsigned int i;

        mrt = ipmr_get_table(net, id);
        if (mrt != NULL)
                return mrt;

        mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
        if (mrt == NULL)
                return NULL;
        write_pnet(&mrt->net, net);
        mrt->id = id;

        /* Forwarding cache */
        for (i = 0; i < MFC_LINES; i++)
                INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);

        INIT_LIST_HEAD(&mrt->mfc_unres_queue);

        setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
                    (unsigned long)mrt);

#ifdef CONFIG_IP_PIMSM
        mrt->mroute_reg_vif_num = -1;
#endif
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
        list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
#endif
        return mrt;
}

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
{
        struct net *net = dev_net(dev);

        dev_close(dev);

        dev = __dev_get_by_name(net, "tunl0");
        if (dev) {
                const struct net_device_ops *ops = dev->netdev_ops;
                struct ifreq ifr;
                struct ip_tunnel_parm p;

                memset(&p, 0, sizeof(p));
                p.iph.daddr = v->vifc_rmt_addr.s_addr;
                p.iph.saddr = v->vifc_lcl_addr.s_addr;
                p.iph.version = 4;
                p.iph.ihl = 5;
                p.iph.protocol = IPPROTO_IPIP;
                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
                ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

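                /* The tunnel ioctl expects a user-space pointer, so the
                 * address limit is switched to KERNEL_DS for the duration
                 * of the call and the on-stack parm block passed directly.
                 */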
                if (ops->ndo_do_ioctl) {
                        mm_segment_t oldfs = get_fs();

                        set_fs(KERNEL_DS);
                        ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
                        set_fs(oldfs);
                }
        }
}

static
struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
{
        struct net_device  *dev;

        dev = __dev_get_by_name(net, "tunl0");

        if (dev) {
                const struct net_device_ops *ops = dev->netdev_ops;
                int err;
                struct ifreq ifr;
                struct ip_tunnel_parm p;
                struct in_device  *in_dev;

                memset(&p, 0, sizeof(p));
                p.iph.daddr = v->vifc_rmt_addr.s_addr;
                p.iph.saddr = v->vifc_lcl_addr.s_addr;
                p.iph.version = 4;
                p.iph.ihl = 5;
                p.iph.protocol = IPPROTO_IPIP;
                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
                ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

                if (ops->ndo_do_ioctl) {
                        mm_segment_t oldfs = get_fs();

                        set_fs(KERNEL_DS);
                        err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
                        set_fs(oldfs);
                } else
                        err = -EOPNOTSUPP;

                dev = NULL;

                if (err == 0 &&
                    (dev = __dev_get_by_name(net, p.name)) != NULL) {
                        dev->flags |= IFF_MULTICAST;

                        in_dev = __in_dev_get_rtnl(dev);
                        if (in_dev == NULL)
                                goto failure;

                        ipv4_devconf_setall(in_dev);
                        IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;

                        if (dev_open(dev))
                                goto failure;
                        dev_hold(dev);
                }
        }
        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}

#ifdef CONFIG_IP_PIMSM

static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct mr_table *mrt;
        struct flowi fl = {
                .oif            = dev->ifindex,
                .iif            = skb->skb_iif,
                .mark           = skb->mark,
        };
        int err;

        err = ipmr_fib_lookup(net, &fl, &mrt);
        if (err < 0) {
                kfree_skb(skb);
                return err;
        }

        read_lock(&mrt_lock);
        dev->stats.tx_bytes += skb->len;
        dev->stats.tx_packets++;
        ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
        read_unlock(&mrt_lock);
        kfree_skb(skb);
        return NETDEV_TX_OK;
}

static const struct net_device_ops reg_vif_netdev_ops = {
        .ndo_start_xmit = reg_vif_xmit,
};

static void reg_vif_setup(struct net_device *dev)
{
        dev->type               = ARPHRD_PIMREG;
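        /* Leave room for an outer IP header plus what is presumably the
         * 8-byte PIM register header on top of the Ethernet payload.
         */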
        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
        dev->flags              = IFF_NOARP;
        dev->netdev_ops         = &reg_vif_netdev_ops;
        dev->destructor         = free_netdev;
        dev->features           |= NETIF_F_NETNS_LOCAL;
}

static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
        struct net_device *dev;
        struct in_device *in_dev;
        char name[IFNAMSIZ];

        if (mrt->id == RT_TABLE_DEFAULT)
                sprintf(name, "pimreg");
        else
                sprintf(name, "pimreg%u", mrt->id);

        dev = alloc_netdev(0, name, reg_vif_setup);

        if (dev == NULL)
                return NULL;

        dev_net_set(dev, net);

        if (register_netdevice(dev)) {
                free_netdev(dev);
                return NULL;
        }
        dev->iflink = 0;

        rcu_read_lock();
        if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
                rcu_read_unlock();
                goto failure;
        }

        ipv4_devconf_setall(in_dev);
        IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
        rcu_read_unlock();

        if (dev_open(dev))
                goto failure;

        dev_hold(dev);

        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}
#endif

/*
 *      Delete a VIF entry
 *      @notify: Set to 1 if the caller is a notifier_call
 */

static int vif_delete(struct mr_table *mrt, int vifi, int notify,
                      struct list_head *head)
{
        struct vif_device *v;
        struct net_device *dev;
        struct in_device *in_dev;

        if (vifi < 0 || vifi >= mrt->maxvif)
                return -EADDRNOTAVAIL;

        v = &mrt->vif_table[vifi];

        write_lock_bh(&mrt_lock);
        dev = v->dev;
        v->dev = NULL;

        if (!dev) {
                write_unlock_bh(&mrt_lock);
                return -EADDRNOTAVAIL;
        }

#ifdef CONFIG_IP_PIMSM
        if (vifi == mrt->mroute_reg_vif_num)
                mrt->mroute_reg_vif_num = -1;
#endif

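        /* Shrink maxvif so it again indexes one past the highest vif
         * slot that still has a device attached.
         */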
        if (vifi+1 == mrt->maxvif) {
                int tmp;
                for (tmp=vifi-1; tmp>=0; tmp--) {
                        if (VIF_EXISTS(mrt, tmp))
                                break;
                }
                mrt->maxvif = tmp+1;
        }

        write_unlock_bh(&mrt_lock);

        dev_set_allmulti(dev, -1);

        if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
                IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
                ip_rt_multicast_event(in_dev);
        }

        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
                unregister_netdevice_queue(dev, head);

        dev_put(dev);
        return 0;
}

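/* At this point in the conversion only mroute_sk is RCU-protected;
 * resolved cache entries are still reached under mrt_lock, so they can
 * be returned to the slab immediately, without a grace period.
 */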
static inline void ipmr_cache_free(struct mfc_cache *c)
{
        kmem_cache_free(mrt_cachep, c);
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
{
        struct net *net = read_pnet(&mrt->net);
        struct sk_buff *skb;
        struct nlmsgerr *e;

        atomic_dec(&mrt->cache_resolve_queue_len);

        while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
                        nlh->nlmsg_type = NLMSG_ERROR;
                        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                        skb_trim(skb, nlh->nlmsg_len);
                        e = NLMSG_DATA(nlh);
                        e->error = -ETIMEDOUT;
                        memset(&e->msg, 0, sizeof(e->msg));

                        rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
                } else
                        kfree_skb(skb);
        }

        ipmr_cache_free(c);
}


/* Timer process for the unresolved queue. */

static void ipmr_expire_process(unsigned long arg)
{
        struct mr_table *mrt = (struct mr_table *)arg;
        unsigned long now;
        unsigned long expires;
        struct mfc_cache *c, *next;

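        /* The timer runs in softirq context; if somebody currently holds
         * mfc_unres_lock, back off and retry in HZ/10 instead of spinning.
         */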
        if (!spin_trylock(&mfc_unres_lock)) {
                mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
                return;
        }

        if (list_empty(&mrt->mfc_unres_queue))
                goto out;

        now = jiffies;
        expires = 10*HZ;

        list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
                if (time_after(c->mfc_un.unres.expires, now)) {
                        unsigned long interval = c->mfc_un.unres.expires - now;
                        if (interval < expires)
                                expires = interval;
                        continue;
                }

                list_del(&c->list);
                ipmr_destroy_unres(mrt, c);
        }

        if (!list_empty(&mrt->mfc_unres_queue))
                mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);

out:
        spin_unlock(&mfc_unres_lock);
}

/* Fill oifs list. It is called under write locked mrt_lock. */

static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
                                   unsigned char *ttls)
{
        int vifi;

        cache->mfc_un.res.minvif = MAXVIFS;
        cache->mfc_un.res.maxvif = 0;
        memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

        for (vifi = 0; vifi < mrt->maxvif; vifi++) {
                if (VIF_EXISTS(mrt, vifi) &&
                    ttls[vifi] && ttls[vifi] < 255) {
                        cache->mfc_un.res.ttls[vifi] = ttls[vifi];
                        if (cache->mfc_un.res.minvif > vifi)
                                cache->mfc_un.res.minvif = vifi;
                        if (cache->mfc_un.res.maxvif <= vifi)
                                cache->mfc_un.res.maxvif = vifi + 1;
                }
        }
}

static int vif_add(struct net *net, struct mr_table *mrt,
                   struct vifctl *vifc, int mrtsock)
{
        int vifi = vifc->vifc_vifi;
        struct vif_device *v = &mrt->vif_table[vifi];
        struct net_device *dev;
        struct in_device *in_dev;
        int err;

        /* Is vif busy ? */
        if (VIF_EXISTS(mrt, vifi))
                return -EADDRINUSE;

        switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
        case VIFF_REGISTER:
                /*
                 * Special Purpose VIF in PIM
                 * All the packets will be sent to the daemon
                 */
                if (mrt->mroute_reg_vif_num >= 0)
                        return -EADDRINUSE;
                dev = ipmr_reg_vif(net, mrt);
                if (!dev)
                        return -ENOBUFS;
                err = dev_set_allmulti(dev, 1);
                if (err) {
                        unregister_netdevice(dev);
                        dev_put(dev);
                        return err;
                }
                break;
#endif
        case VIFF_TUNNEL:
                dev = ipmr_new_tunnel(net, vifc);
                if (!dev)
                        return -ENOBUFS;
                err = dev_set_allmulti(dev, 1);
                if (err) {
                        ipmr_del_tunnel(dev, vifc);
                        dev_put(dev);
                        return err;
                }
                break;

        case VIFF_USE_IFINDEX:
        case 0:
                if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
                        dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
                        if (dev && __in_dev_get_rtnl(dev) == NULL) {
                                dev_put(dev);
                                return -EADDRNOTAVAIL;
                        }
                } else
                        dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);

                if (!dev)
                        return -EADDRNOTAVAIL;
                err = dev_set_allmulti(dev, 1);
                if (err) {
                        dev_put(dev);
                        return err;
                }
                break;
        default:
                return -EINVAL;
        }

        if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
                dev_put(dev);
                return -EADDRNOTAVAIL;
        }
        IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
        ip_rt_multicast_event(in_dev);

        /*
         *      Fill in the VIF structures
         */
        v->rate_limit = vifc->vifc_rate_limit;
        v->local = vifc->vifc_lcl_addr.s_addr;
        v->remote = vifc->vifc_rmt_addr.s_addr;
        v->flags = vifc->vifc_flags;
        if (!mrtsock)
                v->flags |= VIFF_STATIC;
        v->threshold = vifc->vifc_threshold;
        v->bytes_in = 0;
        v->bytes_out = 0;
        v->pkt_in = 0;
        v->pkt_out = 0;
        v->link = dev->ifindex;
        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                v->link = dev->iflink;

        /* And finish update writing critical data */
        write_lock_bh(&mrt_lock);
        v->dev = dev;
#ifdef CONFIG_IP_PIMSM
        if (v->flags&VIFF_REGISTER)
                mrt->mroute_reg_vif_num = vifi;
#endif
        if (vifi+1 > mrt->maxvif)
                mrt->maxvif = vifi+1;
        write_unlock_bh(&mrt_lock);
        return 0;
}

static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
                                         __be32 origin,
                                         __be32 mcastgrp)
{
        int line = MFC_HASH(mcastgrp, origin);
        struct mfc_cache *c;

        list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
                if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
                        return c;
        }
        return NULL;
}

/*
 *      Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(void)
{
        struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
        if (c == NULL)
                return NULL;
        c->mfc_un.res.minvif = MAXVIFS;
        return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
        struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
        if (c == NULL)
                return NULL;
        skb_queue_head_init(&c->mfc_un.unres.unresolved);
        c->mfc_un.unres.expires = jiffies + 10*HZ;
        return c;
}

/*
 *      A cache entry has gone from queued to resolved state
 */

static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
                               struct mfc_cache *uc, struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        /*
         *      Play the pending entries through our router
         */

        while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

                        if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
                                nlh->nlmsg_len = (skb_tail_pointer(skb) -
                                                  (u8 *)nlh);
                        } else {
                                nlh->nlmsg_type = NLMSG_ERROR;
                                nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                                skb_trim(skb, nlh->nlmsg_len);
                                e = NLMSG_DATA(nlh);
                                e->error = -EMSGSIZE;
                                memset(&e->msg, 0, sizeof(e->msg));
                        }

                        rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
                } else
                        ip_mr_forward(net, mrt, skb, c, 0);
        }
}

/*
 *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *      expects the following bizarre scheme.
 *
 *      Called under mrt_lock.
 */

static int ipmr_cache_report(struct mr_table *mrt,
                             struct sk_buff *pkt, vifi_t vifi, int assert)
{
        struct sk_buff *skb;
        const int ihl = ip_hdrlen(pkt);
        struct igmphdr *igmp;
        struct igmpmsg *msg;
        struct sock *mroute_sk;
        int ret;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT)
                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
        else
#endif
                skb = alloc_skb(128, GFP_ATOMIC);

        if (!skb)
                return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT) {
                /* Ugly, but we have no choice with this interface.
                   Duplicate old header, fix ihl, length etc.
                   And all this only to mangle msg->im_msgtype and
                   to set msg->im_mbz to "mbz" :-)
                 */
                skb_push(skb, sizeof(struct iphdr));
                skb_reset_network_header(skb);
                skb_reset_transport_header(skb);
                msg = (struct igmpmsg *)skb_network_header(skb);
                memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
                msg->im_msgtype = IGMPMSG_WHOLEPKT;
                msg->im_mbz = 0;
                msg->im_vif = mrt->mroute_reg_vif_num;
                ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
                ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
                                             sizeof(struct iphdr));
        } else
#endif
        {

        /*
         *      Copy the IP header
         */

        skb->network_header = skb->tail;
        skb_put(skb, ihl);
        skb_copy_to_linear_data(skb, pkt->data, ihl);
        ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
        msg = (struct igmpmsg *)skb_network_header(skb);
        msg->im_vif = vifi;
        skb_dst_set(skb, dst_clone(skb_dst(pkt)));

        /*
         *      Add our header
         */

        igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
        igmp->type      =
        msg->im_msgtype = assert;
        igmp->code      =       0;
        ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
        skb->transport_header = skb->network_header;
        }

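        /* This is the RCU-converted read side: the mrouted socket is
         * picked up under rcu_read_lock() instead of mrt_lock.
         */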
        rcu_read_lock();
        mroute_sk = rcu_dereference(mrt->mroute_sk);
        if (mroute_sk == NULL) {
                rcu_read_unlock();
                kfree_skb(skb);
                return -EINVAL;
        }

        /*
         *      Deliver to mrouted
         */
        ret = sock_queue_rcv_skb(mroute_sk, skb);
        rcu_read_unlock();
        if (ret < 0) {
                if (net_ratelimit())
                        printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
                kfree_skb(skb);
        }

        return ret;
}

/*
 *      Queue a packet for resolution: attach it to a (possibly new)
 *      unresolved cache entry under mfc_unres_lock.
 */

static int
ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
{
        bool found = false;
        int err;
        struct mfc_cache *c;
        const struct iphdr *iph = ip_hdr(skb);

        spin_lock_bh(&mfc_unres_lock);
        list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
                if (c->mfc_mcastgrp == iph->daddr &&
                    c->mfc_origin == iph->saddr) {
                        found = true;
                        break;
                }
        }

        if (!found) {
                /*
                 *      Create a new entry if allowable
                 */

                if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
                    (c = ipmr_cache_alloc_unres()) == NULL) {
                        spin_unlock_bh(&mfc_unres_lock);

                        kfree_skb(skb);
                        return -ENOBUFS;
                }

                /*
                 *      Fill in the new cache entry
                 */
                c->mfc_parent   = -1;
                c->mfc_origin   = iph->saddr;
                c->mfc_mcastgrp = iph->daddr;

                /*
                 *      Reflect first query at mrouted.
                 */
                err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
                if (err < 0) {
                        /* If the report failed throw the cache entry
                           out - Brad Parker
                         */
                        spin_unlock_bh(&mfc_unres_lock);

                        ipmr_cache_free(c);
                        kfree_skb(skb);
                        return err;
                }

                atomic_inc(&mrt->cache_resolve_queue_len);
                list_add(&c->list, &mrt->mfc_unres_queue);

                if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
                        mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
        }

        /*
         *      See if we can append the packet
         */
        if (c->mfc_un.unres.unresolved.qlen > 3) {
                kfree_skb(skb);
                err = -ENOBUFS;
        } else {
                skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
                err = 0;
        }

        spin_unlock_bh(&mfc_unres_lock);
        return err;
}

/*
 *      MFC cache manipulation by user space mroute daemon
 */

static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
{
        int line;
        struct mfc_cache *c, *next;

        line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
                        write_lock_bh(&mrt_lock);
                        list_del(&c->list);
                        write_unlock_bh(&mrt_lock);

                        ipmr_cache_free(c);
                        return 0;
                }
        }
        return -ENOENT;
}

static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
                        struct mfcctl *mfc, int mrtsock)
{
        bool found = false;
        int line;
        struct mfc_cache *uc, *c;

        if (mfc->mfcc_parent >= MAXVIFS)
                return -ENFILE;

        line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
                        found = true;
                        break;
                }
        }

        if (found) {
                write_lock_bh(&mrt_lock);
                c->mfc_parent = mfc->mfcc_parent;
                ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
                if (!mrtsock)
                        c->mfc_flags |= MFC_STATIC;
                write_unlock_bh(&mrt_lock);
                return 0;
        }

        if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
                return -EINVAL;

        c = ipmr_cache_alloc();
        if (c == NULL)
                return -ENOMEM;

        c->mfc_origin = mfc->mfcc_origin.s_addr;
        c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
        c->mfc_parent = mfc->mfcc_parent;
        ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
        if (!mrtsock)
                c->mfc_flags |= MFC_STATIC;

        write_lock_bh(&mrt_lock);
        list_add(&c->list, &mrt->mfc_cache_array[line]);
        write_unlock_bh(&mrt_lock);

        /*
         *      Check to see if we resolved a queued list. If so we
         *      need to send on the frames and tidy up.
         */
        found = false;
        spin_lock_bh(&mfc_unres_lock);
        list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
                if (uc->mfc_origin == c->mfc_origin &&
                    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
                        list_del(&uc->list);
                        atomic_dec(&mrt->cache_resolve_queue_len);
                        found = true;
                        break;
                }
        }
        if (list_empty(&mrt->mfc_unres_queue))
                del_timer(&mrt->ipmr_expire_timer);
        spin_unlock_bh(&mfc_unres_lock);

        if (found) {
                ipmr_cache_resolve(net, mrt, uc, c);
                ipmr_cache_free(uc);
        }
        return 0;
}

/*
 *      Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct mr_table *mrt)
{
        int i;
        LIST_HEAD(list);
        struct mfc_cache *c, *next;

        /*
         *      Shut down all active vif entries
         */
        for (i = 0; i < mrt->maxvif; i++) {
                if (!(mrt->vif_table[i].flags&VIFF_STATIC))
                        vif_delete(mrt, i, 0, &list);
        }
        unregister_netdevice_many(&list);

        /*
         *      Wipe the cache
         */
        for (i = 0; i < MFC_LINES; i++) {
                list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
                        if (c->mfc_flags&MFC_STATIC)
                                continue;
                        write_lock_bh(&mrt_lock);
                        list_del(&c->list);
                        write_unlock_bh(&mrt_lock);

                        ipmr_cache_free(c);
                }
        }

        if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
                spin_lock_bh(&mfc_unres_lock);
                list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
                        list_del(&c->list);
                        ipmr_destroy_unres(mrt, c);
                }
                spin_unlock_bh(&mfc_unres_lock);
        }
}

/* Called from ip_ra_control(), before an RCU grace period;
 * we don't need to call synchronize_rcu() here.
 */
static void mrtsock_destruct(struct sock *sk)
{
        struct net *net = sock_net(sk);
        struct mr_table *mrt;

        rtnl_lock();
        ipmr_for_each_table(mrt, net) {
                if (sk == rtnl_dereference(mrt->mroute_sk)) {
                        IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
                        rcu_assign_pointer(mrt->mroute_sk, NULL);
                        mroute_clean_tables(mrt);
                }
        }
        rtnl_unlock();
}

/*
 *      Socket options and virtual interface manipulation. The whole
 *      virtual interface system is a complete heap, but unfortunately
 *      that's how BSD mrouted happens to think. Maybe one day with a proper
 *      MOSPF/PIM router set up we can clean this up.
 */

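/* A minimal sketch of the daemon-side handshake this interface expects,
 * assuming a raw IGMP socket (illustrative only, not part of this file):
 *
 *      int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *      int one = 1;
 *      setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *
 * On success the kernel publishes this socket in mrt->mroute_sk and
 * delivers IGMPMSG_* upcalls to it; MRT_DONE tears it down again.
 */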
int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
{
        int ret;
        struct vifctl vif;
        struct mfcctl mfc;
        struct net *net = sock_net(sk);
        struct mr_table *mrt;

        mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
        if (mrt == NULL)
                return -ENOENT;

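        /* rcu_dereference_raw() is sufficient here: the pointer is only
         * compared against sk, never dereferenced, so no RCU read-side
         * critical section is required.
         */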
        if (optname != MRT_INIT) {
                if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
                    !capable(CAP_NET_ADMIN))
                        return -EACCES;
        }

        switch (optname) {
        case MRT_INIT:
                if (sk->sk_type != SOCK_RAW ||
                    inet_sk(sk)->inet_num != IPPROTO_IGMP)
                        return -EOPNOTSUPP;
                if (optlen != sizeof(int))
                        return -ENOPROTOOPT;

                rtnl_lock();
                if (rtnl_dereference(mrt->mroute_sk)) {
                        rtnl_unlock();
                        return -EADDRINUSE;
                }

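                /* ip_ra_control() registers mrtsock_destruct() as the
                 * socket's destructor; on success the socket is published
                 * with rcu_assign_pointer() so lockless readers see a
                 * fully initialised pointer.
                 */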
                ret = ip_ra_control(sk, 1, mrtsock_destruct);
                if (ret == 0) {
                        rcu_assign_pointer(mrt->mroute_sk, sk);
                        IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
                }
                rtnl_unlock();
                return ret;
        case MRT_DONE:
                if (sk != rcu_dereference_raw(mrt->mroute_sk))
                        return -EACCES;
                return ip_ra_control(sk, 0, NULL);
        case MRT_ADD_VIF:
        case MRT_DEL_VIF:
                if (optlen != sizeof(vif))
                        return -EINVAL;
                if (copy_from_user(&vif, optval, sizeof(vif)))
                        return -EFAULT;
                if (vif.vifc_vifi >= MAXVIFS)
                        return -ENFILE;
                rtnl_lock();
                if (optname == MRT_ADD_VIF) {
                        ret = vif_add(net, mrt, &vif,
                                      sk == rtnl_dereference(mrt->mroute_sk));
                } else {
                        ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
                }
                rtnl_unlock();
                return ret;

                /*
                 *      Manipulate the forwarding caches. These live
                 *      in a sort of kernel/user symbiosis.
                 */
        case MRT_ADD_MFC:
        case MRT_DEL_MFC:
                if (optlen != sizeof(mfc))
                        return -EINVAL;
                if (copy_from_user(&mfc, optval, sizeof(mfc)))
                        return -EFAULT;
                rtnl_lock();
                if (optname == MRT_DEL_MFC)
                        ret = ipmr_mfc_delete(mrt, &mfc);
                else
                        ret = ipmr_mfc_add(net, mrt, &mfc,
                                           sk == rtnl_dereference(mrt->mroute_sk));
                rtnl_unlock();
                return ret;
                /*
                 *      Control PIM assert.
                 */
        case MRT_ASSERT:
        {
                int v;
                if (get_user(v, (int __user *)optval))
                        return -EFAULT;
                mrt->mroute_do_assert = (v) ? 1 : 0;
                return 0;
        }
#ifdef CONFIG_IP_PIMSM
        case MRT_PIM:
        {
                int v;

                if (get_user(v, (int __user *)optval))
                        return -EFAULT;
                v = (v) ? 1 : 0;

                rtnl_lock();
                ret = 0;
                if (v != mrt->mroute_do_pim) {
                        mrt->mroute_do_pim = v;
                        mrt->mroute_do_assert = v;
                }
                rtnl_unlock();
                return ret;
        }
#endif
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
        case MRT_TABLE:
        {
                u32 v;

                if (optlen != sizeof(u32))
                        return -EINVAL;
                if (get_user(v, (u32 __user *)optval))
                        return -EFAULT;

                rtnl_lock();
                ret = 0;
                if (sk == rtnl_dereference(mrt->mroute_sk)) {
                        ret = -EBUSY;
                } else {
                        if (!ipmr_new_table(net, v))
                                ret = -ENOMEM;
                        raw_sk(sk)->ipmr_table = v;
                }
                rtnl_unlock();
                return ret;
        }
#endif
        /*
         *      Spurious command, or MRT_VERSION which you cannot
         *      set.
         */
        default:
                return -ENOPROTOOPT;
        }
}

/*
 *      Getsockopt support for the multicast routing system.
 */

int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
{
        int olr;
        int val;
        struct net *net = sock_net(sk);
        struct mr_table *mrt;

        mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
        if (mrt == NULL)
                return -ENOENT;

        if (optname != MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
            optname != MRT_PIM &&
#endif
            optname != MRT_ASSERT)
                return -ENOPROTOOPT;

        if (get_user(olr, optlen))
                return -EFAULT;

        if (olr < 0)
                return -EINVAL;

        olr = min_t(unsigned int, olr, sizeof(int));

        if (put_user(olr, optlen))
                return -EFAULT;
        if (optname == MRT_VERSION)
                val = 0x0305;
#ifdef CONFIG_IP_PIMSM
        else if (optname == MRT_PIM)
                val = mrt->mroute_do_pim;
#endif
        else
                val = mrt->mroute_do_assert;
        if (copy_to_user(optval, &val, olr))
                return -EFAULT;
        return 0;
}

/*
 *      The IP multicast ioctl support routines.
 */

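/* A sketch of how user space would read these counters (illustrative,
 * assuming the MRT_INIT socket from the sketch above):
 *
 *      struct sioc_vif_req vr = { .vifi = 0 };
 *      if (ioctl(fd, SIOCGETVIFCNT, &vr) == 0)
 *              printf("vif0: %lu pkts in\n", vr.icount);
 */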
1387 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1388 {
1389         struct sioc_sg_req sr;
1390         struct sioc_vif_req vr;
1391         struct vif_device *vif;
1392         struct mfc_cache *c;
1393         struct net *net = sock_net(sk);
1394         struct mr_table *mrt;
1395
1396         mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1397         if (mrt == NULL)
1398                 return -ENOENT;
1399
1400         switch (cmd) {
1401         case SIOCGETVIFCNT:
1402                 if (copy_from_user(&vr, arg, sizeof(vr)))
1403                         return -EFAULT;
1404                 if (vr.vifi >= mrt->maxvif)
1405                         return -EINVAL;
1406                 read_lock(&mrt_lock);
1407                 vif = &mrt->vif_table[vr.vifi];
1408                 if (VIF_EXISTS(mrt, vr.vifi)) {
1409                         vr.icount = vif->pkt_in;
1410                         vr.ocount = vif->pkt_out;
1411                         vr.ibytes = vif->bytes_in;
1412                         vr.obytes = vif->bytes_out;
1413                         read_unlock(&mrt_lock);
1414
1415                         if (copy_to_user(arg, &vr, sizeof(vr)))
1416                                 return -EFAULT;
1417                         return 0;
1418                 }
1419                 read_unlock(&mrt_lock);
1420                 return -EADDRNOTAVAIL;
1421         case SIOCGETSGCNT:
1422                 if (copy_from_user(&sr, arg, sizeof(sr)))
1423                         return -EFAULT;
1424
1425                 read_lock(&mrt_lock);
1426                 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1427                 if (c) {
1428                         sr.pktcnt = c->mfc_un.res.pkt;
1429                         sr.bytecnt = c->mfc_un.res.bytes;
1430                         sr.wrong_if = c->mfc_un.res.wrong_if;
1431                         read_unlock(&mrt_lock);
1432
1433                         if (copy_to_user(arg, &sr, sizeof(sr)))
1434                                 return -EFAULT;
1435                         return 0;
1436                 }
1437                 read_unlock(&mrt_lock);
1438                 return -EADDRNOTAVAIL;
1439         default:
1440                 return -ENOIOCTLCMD;
1441         }
1442 }
1443
1444
1445 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1446 {
1447         struct net_device *dev = ptr;
1448         struct net *net = dev_net(dev);
1449         struct mr_table *mrt;
1450         struct vif_device *v;
1451         int ct;
1452         LIST_HEAD(list);
1453
1454         if (event != NETDEV_UNREGISTER)
1455                 return NOTIFY_DONE;
1456
1457         ipmr_for_each_table(mrt, net) {
1458                 v = &mrt->vif_table[0];
1459                 for (ct = 0; ct < mrt->maxvif; ct++, v++) {
1460                         if (v->dev == dev)
1461                                 vif_delete(mrt, ct, 1, &list);
1462                 }
1463         }
1464         unregister_netdevice_many(&list);
1465         return NOTIFY_DONE;
1466 }
1467
1468
1469 static struct notifier_block ip_mr_notifier = {
1470         .notifier_call = ipmr_device_event,
1471 };
1472
1473 /*
1474  *      Encapsulate a packet by attaching a valid IPIP header to it.
1475  *      This avoids tunnel drivers and other mess and gives us the speed so
1476  *      important for multicast video.
1477  */
1478
1479 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1480 {
1481         struct iphdr *iph;
1482         struct iphdr *old_iph = ip_hdr(skb);
1483
1484         skb_push(skb, sizeof(struct iphdr));
1485         skb->transport_header = skb->network_header;
1486         skb_reset_network_header(skb);
1487         iph = ip_hdr(skb);
1488
1489         iph->version    =       4;
1490         iph->tos        =       old_iph->tos;
1491         iph->ttl        =       old_iph->ttl;
1492         iph->frag_off   =       0;
1493         iph->daddr      =       daddr;
1494         iph->saddr      =       saddr;
1495         iph->protocol   =       IPPROTO_IPIP;
1496         iph->ihl        =       5;
1497         iph->tot_len    =       htons(skb->len);
1498         ip_select_ident(iph, skb_dst(skb), NULL);
1499         ip_send_check(iph);
1500
1501         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1502         nf_reset(skb);
1503 }
1504
1505 static inline int ipmr_forward_finish(struct sk_buff *skb)
1506 {
1507         struct ip_options * opt = &(IPCB(skb)->opt);
1508
1509         IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1510
1511         if (unlikely(opt->optlen))
1512                 ip_forward_options(skb);
1513
1514         return dst_output(skb);
1515 }
1516
1517 /*
1518  *      Processing handlers for ipmr_forward
1519  */
1520
1521 static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1522                             struct sk_buff *skb, struct mfc_cache *c, int vifi)
1523 {
1524         const struct iphdr *iph = ip_hdr(skb);
1525         struct vif_device *vif = &mrt->vif_table[vifi];
1526         struct net_device *dev;
1527         struct rtable *rt;
1528         int    encap = 0;
1529
1530         if (vif->dev == NULL)
1531                 goto out_free;
1532
1533 #ifdef CONFIG_IP_PIMSM
1534         if (vif->flags & VIFF_REGISTER) {
1535                 vif->pkt_out++;
1536                 vif->bytes_out += skb->len;
1537                 vif->dev->stats.tx_bytes += skb->len;
1538                 vif->dev->stats.tx_packets++;
1539                 ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
1540                 goto out_free;
1541         }
1542 #endif
1543
1544         if (vif->flags&VIFF_TUNNEL) {
1545                 struct flowi fl = { .oif = vif->link,
1546                                     .nl_u = { .ip4_u =
1547                                               { .daddr = vif->remote,
1548                                                 .saddr = vif->local,
1549                                                 .tos = RT_TOS(iph->tos) } },
1550                                     .proto = IPPROTO_IPIP };
1551                 if (ip_route_output_key(net, &rt, &fl))
1552                         goto out_free;
1553                 encap = sizeof(struct iphdr);
1554         } else {
1555                 struct flowi fl = { .oif = vif->link,
1556                                     .nl_u = { .ip4_u =
1557                                               { .daddr = iph->daddr,
1558                                                 .tos = RT_TOS(iph->tos) } },
1559                                     .proto = IPPROTO_IPIP };
1560                 if (ip_route_output_key(net, &rt, &fl))
1561                         goto out_free;
1562         }
1563
1564         dev = rt->dst.dev;
1565
1566         if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
1567                 /* Do not fragment multicasts. Alas, IPv4 does not
1568                    allow to send ICMP, so that packets will disappear
1569                    to blackhole.
1570                  */
1571
1572                 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
1573                 ip_rt_put(rt);
1574                 goto out_free;
1575         }
1576
1577         encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
1578
1579         if (skb_cow(skb, encap)) {
1580                 ip_rt_put(rt);
1581                 goto out_free;
1582         }
1583
1584         vif->pkt_out++;
1585         vif->bytes_out += skb->len;
1586
1587         skb_dst_drop(skb);
1588         skb_dst_set(skb, &rt->dst);
1589         ip_decrease_ttl(ip_hdr(skb));
1590
1591         /* FIXME: forward and output firewalls used to be called here.
1592          * What do we do with netfilter? -- RR */
1593         if (vif->flags & VIFF_TUNNEL) {
1594                 ip_encap(skb, vif->local, vif->remote);
1595                 /* FIXME: extra output firewall step used to be here. --RR */
1596                 vif->dev->stats.tx_packets++;
1597                 vif->dev->stats.tx_bytes += skb->len;
1598         }
1599
1600         IPCB(skb)->flags |= IPSKB_FORWARDED;
1601
1602         /*
1603          * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
1604          * locally not only before forwarding, but also after forwarding on
1605          * all output interfaces. Clearly, if the mrouter runs a multicast
1606          * application, it should receive packets regardless of which
1607          * interface the application joined on.
1608          * If we did not do this, the application would have to join on all
1609          * interfaces. On the other hand, a multihomed host (or router, but
1610          * not an mrouter) must not join on more than one interface - that
1611          * would result in receiving duplicate packets.
1612          */
1613         NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,
1614                 ipmr_forward_finish);
1615         return;
1616
1617 out_free:
1618         kfree_skb(skb);
1619 }
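
/*
 * Illustrative sketch (not part of the original file): the MTU test in
 * ipmr_queue_xmit() above reduces to the predicate below.  A multicast
 * packet is dropped rather than fragmented when it carries the DF bit and
 * would not fit on the output route once the optional IPIP encapsulation
 * header is accounted for.  ipmr_xmit_would_drop() is a hypothetical name
 * used only for this example.
 */
static inline bool ipmr_xmit_would_drop(const struct sk_buff *skb,
					const struct rtable *rt, int encap)
{
	const struct iphdr *iph = ip_hdr(skb);

	/* same test as in ipmr_queue_xmit(): too big for the route, DF set */
	return skb->len + encap > dst_mtu(&rt->dst) &&
	       (ntohs(iph->frag_off) & IP_DF);
}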
1620
1621 static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
1622 {
1623         int ct;
1624
1625         for (ct = mrt->maxvif-1; ct >= 0; ct--) {
1626                 if (mrt->vif_table[ct].dev == dev)
1627                         break;
1628         }
1629         return ct;
1630 }
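
/*
 * Usage sketch (illustrative only, not part of the original file):
 * ipmr_find_vif() scans the VIF table backwards and falls out of the loop
 * with ct == -1 when the device is not a configured VIF, so callers just
 * test for a negative index:
 *
 *	vif = ipmr_find_vif(mrt, skb->dev);
 *	if (vif < 0)
 *		return -ENODEV;		(device belongs to no VIF here)
 */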
1631
1632 /* "local" means that we should preserve one skb (for local delivery) */
1633
1634 static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1635                          struct sk_buff *skb, struct mfc_cache *cache,
1636                          int local)
1637 {
1638         int psend = -1;
1639         int vif, ct;
1640
1641         vif = cache->mfc_parent;
1642         cache->mfc_un.res.pkt++;
1643         cache->mfc_un.res.bytes += skb->len;
1644
1645         /*
1646          * Wrong interface: drop packet and (maybe) send PIM assert.
1647          */
1648         if (mrt->vif_table[vif].dev != skb->dev) {
1649                 int true_vifi;
1650
1651                 if (skb_rtable(skb)->fl.iif == 0) {
1652                         /* It is our own packet, looped back.
1653                            Very complicated situation...
1654
1655                            The best workaround until the routing daemons
1656                            are fixed is not to redistribute a packet if it
1657                            was sent through the wrong interface. This means
1658                            that multicast applications WILL NOT work for
1659                            (S,G) entries whose default multicast route
1660                            points to the wrong oif. In any case, it is not
1661                            a good idea to run multicast applications on a router.
1662                          */
1663                         goto dont_forward;
1664                 }
1665
1666                 cache->mfc_un.res.wrong_if++;
1667                 true_vifi = ipmr_find_vif(mrt, skb->dev);
1668
1669                 if (true_vifi >= 0 && mrt->mroute_do_assert &&
1670                     /* PIM-SM uses asserts when switching from RPT to SPT,
1671                        so we cannot check that the packet arrived on an oif.
1672                        That is bad, but the alternative would be moving a
1673                        pretty large chunk of pimd into the kernel. Ough... --ANK
1674                      */
1675                     (mrt->mroute_do_pim ||
1676                      cache->mfc_un.res.ttls[true_vifi] < 255) &&
1677                     time_after(jiffies,
1678                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1679                         cache->mfc_un.res.last_assert = jiffies;
1680                         ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
1681                 }
1682                 goto dont_forward;
1683         }
1684
1685         mrt->vif_table[vif].pkt_in++;
1686         mrt->vif_table[vif].bytes_in += skb->len;
1687
1688         /*
1689          *      Forward the frame
1690          */
1691         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1692                 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1693                         if (psend != -1) {
1694                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1695                                 if (skb2)
1696                                         ipmr_queue_xmit(net, mrt, skb2, cache,
1697                                                         psend);
1698                         }
1699                         psend = ct;
1700                 }
1701         }
1702         if (psend != -1) {
1703                 if (local) {
1704                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1705                         if (skb2)
1706                                 ipmr_queue_xmit(net, mrt, skb2, cache, psend);
1707                 } else {
1708                         ipmr_queue_xmit(net, mrt, skb, cache, psend);
1709                         return 0;
1710                 }
1711         }
1712
1713 dont_forward:
1714         if (!local)
1715                 kfree_skb(skb);
1716         return 0;
1717 }
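
/*
 * Illustrative sketch (not part of the original file): the per-oif TTL
 * thresholds consulted by ip_mr_forward() implement TTL scoping.  A value
 * of 255 disables an interface entirely; any lower threshold forwards only
 * packets whose remaining TTL strictly exceeds it.  ipmr_oif_allowed() is
 * a hypothetical helper equivalent to the test in the forwarding loop.
 */
static inline bool ipmr_oif_allowed(const struct sk_buff *skb,
				    const struct mfc_cache *c, int ct)
{
	return ip_hdr(skb)->ttl > c->mfc_un.res.ttls[ct];
}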
1718
1719
1720 /*
1721  *      Multicast packets for forwarding arrive here
1722  *      Called with rcu_read_lock();
1723  */
1724
1725 int ip_mr_input(struct sk_buff *skb)
1726 {
1727         struct mfc_cache *cache;
1728         struct net *net = dev_net(skb->dev);
1729         int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1730         struct mr_table *mrt;
1731         int err;
1732
1733         /* The packet is looped back after forwarding; it must not be
1734            forwarded a second time, but it can still be delivered locally.
1735          */
1736         if (IPCB(skb)->flags & IPSKB_FORWARDED)
1737                 goto dont_forward;
1738
1739         err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
1740         if (err < 0) {
1741                 kfree_skb(skb);
1742                 return err;
1743         }
1744
1745         if (!local) {
1746                 if (IPCB(skb)->opt.router_alert) {
1747                         if (ip_call_ra_chain(skb))
1748                                 return 0;
1749                 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
1750                         /* IGMPv1 (and broken IGMPv2 implementations such as
1751                          * Cisco IOS <= 11.2(8)) do not add the router alert
1752                          * option to IGMP packets destined for routable
1753                          * groups. This is very bad, because it means
1754                          * that we can forward NO IGMP messages.
1755                          */
1756                         struct sock *mroute_sk;
1757
1758                         mroute_sk = rcu_dereference(mrt->mroute_sk);
1759                         if (mroute_sk) {
1760                                 nf_reset(skb);
1761                                 raw_rcv(mroute_sk, skb);
1762                                 return 0;
1763                         }
1764                 }
1765         }
1766
1767         read_lock(&mrt_lock);
1768         cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1769
1770         /*
1771          *      No usable cache entry
1772          */
1773         if (cache == NULL) {
1774                 int vif;
1775
1776                 if (local) {
1777                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1778                         ip_local_deliver(skb);
1779                         if (skb2 == NULL) {
1780                                 read_unlock(&mrt_lock);
1781                                 return -ENOBUFS;
1782                         }
1783                         skb = skb2;
1784                 }
1785
1786                 vif = ipmr_find_vif(mrt, skb->dev);
1787                 if (vif >= 0) {
1788                         int err2 = ipmr_cache_unresolved(mrt, vif, skb);
1789                         read_unlock(&mrt_lock);
1790
1791                         return err2;
1792                 }
1793                 read_unlock(&mrt_lock);
1794                 kfree_skb(skb);
1795                 return -ENODEV;
1796         }
1797
1798         ip_mr_forward(net, mrt, skb, cache, local);
1799
1800         read_unlock(&mrt_lock);
1801
1802         if (local)
1803                 return ip_local_deliver(skb);
1804
1805         return 0;
1806
1807 dont_forward:
1808         if (local)
1809                 return ip_local_deliver(skb);
1810         kfree_skb(skb);
1811         return 0;
1812 }
1813
1814 #ifdef CONFIG_IP_PIMSM
1815 /* called with rcu_read_lock() */
1816 static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1817                      unsigned int pimlen)
1818 {
1819         struct net_device *reg_dev = NULL;
1820         struct iphdr *encap;
1821
1822         encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1823         /*
1824            Check that:
1825            a. the packet is really destined for a multicast group
1826            b. the packet is not a NULL-REGISTER
1827            c. the packet is not truncated
1828          */
1829         if (!ipv4_is_multicast(encap->daddr) ||
1830             encap->tot_len == 0 ||
1831             ntohs(encap->tot_len) + pimlen > skb->len)
1832                 return 1;
1833
1834         read_lock(&mrt_lock);
1835         if (mrt->mroute_reg_vif_num >= 0)
1836                 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
1837         read_unlock(&mrt_lock);
1838
1839         if (reg_dev == NULL)
1840                 return 1;
1841
1842         skb->mac_header = skb->network_header;
1843         skb_pull(skb, (u8 *)encap - skb->data);
1844         skb_reset_network_header(skb);
1845         skb->protocol = htons(ETH_P_IP);
1846         skb->ip_summed = CHECKSUM_NONE;
1847         skb->pkt_type = PACKET_HOST;
1848
1849         skb_tunnel_rx(skb, reg_dev);
1850
1851         netif_rx(skb);
1852
1853         return NET_RX_SUCCESS;
1854 }
1855 #endif
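
/*
 * Illustrative note (not part of the original file): a PIM REGISTER as
 * handled by __pim_rcv() is laid out as below, which is why the inner IP
 * header is found pimlen bytes past the transport header before the outer
 * headers are pulled off and the skb is requeued on the register vif:
 *
 *	+----------------+---------------------+------------------------+
 *	| outer IP (PIM) | PIM register header | encapsulated IP packet |
 *	+----------------+---------------------+------------------------+
 *	                 ^ skb_transport_header(skb)
 *	                                       ^ ... + pimlen
 */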
1856
1857 #ifdef CONFIG_IP_PIMSM_V1
1858 /*
1859  * Handle PIMv1 messages, which are carried in IGMP packets
1860  */
1861
1862 int pim_rcv_v1(struct sk_buff *skb)
1863 {
1864         struct igmphdr *pim;
1865         struct net *net = dev_net(skb->dev);
1866         struct mr_table *mrt;
1867
1868         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1869                 goto drop;
1870
1871         pim = igmp_hdr(skb);
1872
1873         if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1874                 goto drop;
1875
1876         if (!mrt->mroute_do_pim ||
1877             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1878                 goto drop;
1879
1880         if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1881 drop:
1882                 kfree_skb(skb);
1883         }
1884         return 0;
1885 }
1886 #endif
1887
1888 #ifdef CONFIG_IP_PIMSM_V2
1889 static int pim_rcv(struct sk_buff *skb)
1890 {
1891         struct pimreghdr *pim;
1892         struct net *net = dev_net(skb->dev);
1893         struct mr_table *mrt;
1894
1895         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1896                 goto drop;
1897
1898         pim = (struct pimreghdr *)skb_transport_header(skb);
1899         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1900             (pim->flags&PIM_NULL_REGISTER) ||
1901             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1902              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1903                 goto drop;
1904
1905         if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1906                 goto drop;
1907
1908         if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1909 drop:
1910                 kfree_skb(skb);
1911         }
1912         return 0;
1913 }
1914 #endif
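
/*
 * Illustrative note (not part of the original file): the checksum test in
 * pim_rcv() above accepts a register when either the checksum computed
 * over the PIM header alone or the checksum over the whole packet
 * verifies, so it interoperates with peers that compute it either way.
 */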
1915
1916 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
1917                               struct mfc_cache *c, struct rtmsg *rtm)
1918 {
1919         int ct;
1920         struct rtnexthop *nhp;
1921         u8 *b = skb_tail_pointer(skb);
1922         struct rtattr *mp_head;
1923
1924         /* If cache is unresolved, don't try to parse IIF and OIF */
1925         if (c->mfc_parent >= MAXVIFS)
1926                 return -ENOENT;
1927
1928         if (VIF_EXISTS(mrt, c->mfc_parent))
1929                 RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
1930
1931         mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1932
1933         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1934                 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
1935                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1936                                 goto rtattr_failure;
1937                         nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1938                         nhp->rtnh_flags = 0;
1939                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1940                         nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
1941                         nhp->rtnh_len = sizeof(*nhp);
1942                 }
1943         }
1944         mp_head->rta_type = RTA_MULTIPATH;
1945         mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1946         rtm->rtm_type = RTN_MULTICAST;
1947         return 1;
1948
1949 rtattr_failure:
1950         nlmsg_trim(skb, b);
1951         return -EMSGSIZE;
1952 }
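
/*
 * Illustrative note (not part of the original file): for a resolved entry,
 * __ipmr_fill_mroute() emits an RTA_IIF attribute carrying the input
 * interface index, followed by an RTA_MULTIPATH attribute whose payload is
 * one struct rtnexthop per output interface with the TTL threshold stored
 * in rtnh_hops.  A dump of an (S,G) entry with two oifs thus resembles:
 *
 *	RTA_IIF        -> ifindex of vif_table[c->mfc_parent].dev
 *	RTA_MULTIPATH  -> rtnexthop { ifindex, hops = ttl }, rtnexthop { ... }
 */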
1953
1954 int ipmr_get_route(struct net *net,
1955                    struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1956 {
1957         int err;
1958         struct mr_table *mrt;
1959         struct mfc_cache *cache;
1960         struct rtable *rt = skb_rtable(skb);
1961
1962         mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
1963         if (mrt == NULL)
1964                 return -ENOENT;
1965
1966         read_lock(&mrt_lock);
1967         cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
1968
1969         if (cache == NULL) {
1970                 struct sk_buff *skb2;
1971                 struct iphdr *iph;
1972                 struct net_device *dev;
1973                 int vif;
1974
1975                 if (nowait) {
1976                         read_unlock(&mrt_lock);
1977                         return -EAGAIN;
1978                 }
1979
1980                 dev = skb->dev;
1981                 if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
1982                         read_unlock(&mrt_lock);
1983                         return -ENODEV;
1984                 }
1985                 skb2 = skb_clone(skb, GFP_ATOMIC);
1986                 if (!skb2) {
1987                         read_unlock(&mrt_lock);
1988                         return -ENOMEM;
1989                 }
1990
1991                 skb_push(skb2, sizeof(struct iphdr));
1992                 skb_reset_network_header(skb2);
1993                 iph = ip_hdr(skb2);
1994                 iph->ihl = sizeof(struct iphdr) >> 2;
1995                 iph->saddr = rt->rt_src;
1996                 iph->daddr = rt->rt_dst;
1997                 iph->version = 0;
1998                 err = ipmr_cache_unresolved(mrt, vif, skb2);
1999                 read_unlock(&mrt_lock);
2000                 return err;
2001         }
2002
2003         if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
2004                 cache->mfc_flags |= MFC_NOTIFY;
2005         err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
2006         read_unlock(&mrt_lock);
2007         return err;
2008 }
2009
2010 static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2011                             u32 pid, u32 seq, struct mfc_cache *c)
2012 {
2013         struct nlmsghdr *nlh;
2014         struct rtmsg *rtm;
2015
2016         nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
2017         if (nlh == NULL)
2018                 return -EMSGSIZE;
2019
2020         rtm = nlmsg_data(nlh);
2021         rtm->rtm_family   = RTNL_FAMILY_IPMR;
2022         rtm->rtm_dst_len  = 32;
2023         rtm->rtm_src_len  = 32;
2024         rtm->rtm_tos      = 0;
2025         rtm->rtm_table    = mrt->id;
2026         NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
2027         rtm->rtm_type     = RTN_MULTICAST;
2028         rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
2029         rtm->rtm_protocol = RTPROT_UNSPEC;
2030         rtm->rtm_flags    = 0;
2031
2032         NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
2033         NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
2034
2035         if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
2036                 goto nla_put_failure;
2037
2038         return nlmsg_end(skb, nlh);
2039
2040 nla_put_failure:
2041         nlmsg_cancel(skb, nlh);
2042         return -EMSGSIZE;
2043 }
2044
2045 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2046 {
2047         struct net *net = sock_net(skb->sk);
2048         struct mr_table *mrt;
2049         struct mfc_cache *mfc;
2050         unsigned int t = 0, s_t;
2051         unsigned int h = 0, s_h;
2052         unsigned int e = 0, s_e;
2053
2054         s_t = cb->args[0];
2055         s_h = cb->args[1];
2056         s_e = cb->args[2];
2057
2058         read_lock(&mrt_lock);
2059         ipmr_for_each_table(mrt, net) {
2060                 if (t < s_t)
2061                         goto next_table;
2062                 if (t > s_t)
2063                         s_h = 0;
2064                 for (h = s_h; h < MFC_LINES; h++) {
2065                         list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) {
2066                                 if (e < s_e)
2067                                         goto next_entry;
2068                                 if (ipmr_fill_mroute(mrt, skb,
2069                                                      NETLINK_CB(cb->skb).pid,
2070                                                      cb->nlh->nlmsg_seq,
2071                                                      mfc) < 0)
2072                                         goto done;
2073 next_entry:
2074                                 e++;
2075                         }
2076                         e = s_e = 0;
2077                 }
2078                 s_h = 0;
2079 next_table:
2080                 t++;
2081         }
2082 done:
2083         read_unlock(&mrt_lock);
2084
2085         cb->args[2] = e;
2086         cb->args[1] = h;
2087         cb->args[0] = t;
2088
2089         return skb->len;
2090 }
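
/*
 * Illustrative note (not part of the original file): the dump above keeps
 * a three-level cursor in the netlink callback so that a dump interrupted
 * by a full skb resumes exactly where it stopped:
 *
 *	cb->args[0] = t  (index of the mr_table being walked)
 *	cb->args[1] = h  (hash bucket within mfc_cache_array)
 *	cb->args[2] = e  (entry index within that bucket)
 */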
2091
2092 #ifdef CONFIG_PROC_FS
2093 /*
2094  *      The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
2095  */
2096 struct ipmr_vif_iter {
2097         struct seq_net_private p;
2098         struct mr_table *mrt;
2099         int ct;
2100 };
2101
2102 static struct vif_device *ipmr_vif_seq_idx(struct net *net,
2103                                            struct ipmr_vif_iter *iter,
2104                                            loff_t pos)
2105 {
2106         struct mr_table *mrt = iter->mrt;
2107
2108         for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
2109                 if (!VIF_EXISTS(mrt, iter->ct))
2110                         continue;
2111                 if (pos-- == 0)
2112                         return &mrt->vif_table[iter->ct];
2113         }
2114         return NULL;
2115 }
2116
2117 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
2118         __acquires(mrt_lock)
2119 {
2120         struct ipmr_vif_iter *iter = seq->private;
2121         struct net *net = seq_file_net(seq);
2122         struct mr_table *mrt;
2123
2124         mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2125         if (mrt == NULL)
2126                 return ERR_PTR(-ENOENT);
2127
2128         iter->mrt = mrt;
2129
2130         read_lock(&mrt_lock);
2131         return *pos ? ipmr_vif_seq_idx(net, iter, *pos - 1)
2132                 : SEQ_START_TOKEN;
2133 }
2134
2135 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2136 {
2137         struct ipmr_vif_iter *iter = seq->private;
2138         struct net *net = seq_file_net(seq);
2139         struct mr_table *mrt = iter->mrt;
2140
2141         ++*pos;
2142         if (v == SEQ_START_TOKEN)
2143                 return ipmr_vif_seq_idx(net, iter, 0);
2144
2145         while (++iter->ct < mrt->maxvif) {
2146                 if (!VIF_EXISTS(mrt, iter->ct))
2147                         continue;
2148                 return &mrt->vif_table[iter->ct];
2149         }
2150         return NULL;
2151 }
2152
2153 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
2154         __releases(mrt_lock)
2155 {
2156         read_unlock(&mrt_lock);
2157 }
2158
2159 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
2160 {
2161         struct ipmr_vif_iter *iter = seq->private;
2162         struct mr_table *mrt = iter->mrt;
2163
2164         if (v == SEQ_START_TOKEN) {
2165                 seq_puts(seq,
2166                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
2167         } else {
2168                 const struct vif_device *vif = v;
2169                 const char *name =  vif->dev ? vif->dev->name : "none";
2170
2171                 seq_printf(seq,
2172                            "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
2173                            vif - mrt->vif_table,
2174                            name, vif->bytes_in, vif->pkt_in,
2175                            vif->bytes_out, vif->pkt_out,
2176                            vif->flags, vif->local, vif->remote);
2177         }
2178         return 0;
2179 }
2180
2181 static const struct seq_operations ipmr_vif_seq_ops = {
2182         .start = ipmr_vif_seq_start,
2183         .next  = ipmr_vif_seq_next,
2184         .stop  = ipmr_vif_seq_stop,
2185         .show  = ipmr_vif_seq_show,
2186 };
2187
2188 static int ipmr_vif_open(struct inode *inode, struct file *file)
2189 {
2190         return seq_open_net(inode, file, &ipmr_vif_seq_ops,
2191                             sizeof(struct ipmr_vif_iter));
2192 }
2193
2194 static const struct file_operations ipmr_vif_fops = {
2195         .owner   = THIS_MODULE,
2196         .open    = ipmr_vif_open,
2197         .read    = seq_read,
2198         .llseek  = seq_lseek,
2199         .release = seq_release_net,
2200 };
2201
2202 struct ipmr_mfc_iter {
2203         struct seq_net_private p;
2204         struct mr_table *mrt;
2205         struct list_head *cache;
2206         int ct;
2207 };
2208
2209
2210 static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2211                                           struct ipmr_mfc_iter *it, loff_t pos)
2212 {
2213         struct mr_table *mrt = it->mrt;
2214         struct mfc_cache *mfc;
2215
2216         read_lock(&mrt_lock);
2217         for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
2218                 it->cache = &mrt->mfc_cache_array[it->ct];
2219                 list_for_each_entry(mfc, it->cache, list)
2220                         if (pos-- == 0)
2221                                 return mfc;
2222         }
2223         read_unlock(&mrt_lock);
2224
2225         spin_lock_bh(&mfc_unres_lock);
2226         it->cache = &mrt->mfc_unres_queue;
2227         list_for_each_entry(mfc, it->cache, list)
2228                 if (pos-- == 0)
2229                         return mfc;
2230         spin_unlock_bh(&mfc_unres_lock);
2231
2232         it->cache = NULL;
2233         return NULL;
2234 }
2235
2236
2237 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
2238 {
2239         struct ipmr_mfc_iter *it = seq->private;
2240         struct net *net = seq_file_net(seq);
2241         struct mr_table *mrt;
2242
2243         mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2244         if (mrt == NULL)
2245                 return ERR_PTR(-ENOENT);
2246
2247         it->mrt = mrt;
2248         it->cache = NULL;
2249         it->ct = 0;
2250         return *pos ? ipmr_mfc_seq_idx(net, it, *pos - 1)
2251                 : SEQ_START_TOKEN;
2252 }
2253
2254 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2255 {
2256         struct mfc_cache *mfc = v;
2257         struct ipmr_mfc_iter *it = seq->private;
2258         struct net *net = seq_file_net(seq);
2259         struct mr_table *mrt = it->mrt;
2260
2261         ++*pos;
2262
2263         if (v == SEQ_START_TOKEN)
2264                 return ipmr_mfc_seq_idx(net, it, 0);
2265
2266         if (mfc->list.next != it->cache)
2267                 return list_entry(mfc->list.next, struct mfc_cache, list);
2268
2269         if (it->cache == &mrt->mfc_unres_queue)
2270                 goto end_of_list;
2271
2272         BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
2273
2274         while (++it->ct < MFC_LINES) {
2275                 it->cache = &mrt->mfc_cache_array[it->ct];
2276                 if (list_empty(it->cache))
2277                         continue;
2278                 return list_first_entry(it->cache, struct mfc_cache, list);
2279         }
2280
2281         /* exhausted cache_array, show unresolved */
2282         read_unlock(&mrt_lock);
2283         it->cache = &mrt->mfc_unres_queue;
2284         it->ct = 0;
2285
2286         spin_lock_bh(&mfc_unres_lock);
2287         if (!list_empty(it->cache))
2288                 return list_first_entry(it->cache, struct mfc_cache, list);
2289
2290  end_of_list:
2291         spin_unlock_bh(&mfc_unres_lock);
2292         it->cache = NULL;
2293
2294         return NULL;
2295 }
2296
2297 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
2298 {
2299         struct ipmr_mfc_iter *it = seq->private;
2300         struct mr_table *mrt = it->mrt;
2301
2302         if (it->cache == &mrt->mfc_unres_queue)
2303                 spin_unlock_bh(&mfc_unres_lock);
2304         else if (it->cache == &mrt->mfc_cache_array[it->ct])
2305                 read_unlock(&mrt_lock);
2306 }
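
/*
 * Illustrative note (not part of the original file): the mfc iterator
 * hands off between two locks in mid-walk.  ipmr_mfc_seq_idx() and
 * ipmr_mfc_seq_next() drop mrt_lock once the resolved hash buckets are
 * exhausted and take mfc_unres_lock for the unresolved queue, so stop()
 * must inspect it->cache to know which of the two locks is still held.
 */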
2307
2308 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
2309 {
2310         int n;
2311
2312         if (v == SEQ_START_TOKEN) {
2313                 seq_puts(seq,
2314                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
2315         } else {
2316                 const struct mfc_cache *mfc = v;
2317                 const struct ipmr_mfc_iter *it = seq->private;
2318                 const struct mr_table *mrt = it->mrt;
2319
2320                 seq_printf(seq, "%08X %08X %-3hd",
2321                            (__force u32) mfc->mfc_mcastgrp,
2322                            (__force u32) mfc->mfc_origin,
2323                            mfc->mfc_parent);
2324
2325                 if (it->cache != &mrt->mfc_unres_queue) {
2326                         seq_printf(seq, " %8lu %8lu %8lu",
2327                                    mfc->mfc_un.res.pkt,
2328                                    mfc->mfc_un.res.bytes,
2329                                    mfc->mfc_un.res.wrong_if);
2330                         for (n = mfc->mfc_un.res.minvif;
2331                              n < mfc->mfc_un.res.maxvif; n++) {
2332                                 if (VIF_EXISTS(mrt, n) &&
2333                                     mfc->mfc_un.res.ttls[n] < 255)
2334                                         seq_printf(seq,
2335                                            " %2d:%-3d",
2336                                            n, mfc->mfc_un.res.ttls[n]);
2337                         }
2338                 } else {
2339                         /* unresolved mfc_caches don't contain
2340                          * pkt, bytes and wrong_if values
2341                          */
2342                         seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
2343                 }
2344                 seq_putc(seq, '\n');
2345         }
2346         return 0;
2347 }
2348
2349 static const struct seq_operations ipmr_mfc_seq_ops = {
2350         .start = ipmr_mfc_seq_start,
2351         .next  = ipmr_mfc_seq_next,
2352         .stop  = ipmr_mfc_seq_stop,
2353         .show  = ipmr_mfc_seq_show,
2354 };
2355
2356 static int ipmr_mfc_open(struct inode *inode, struct file *file)
2357 {
2358         return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
2359                             sizeof(struct ipmr_mfc_iter));
2360 }
2361
2362 static const struct file_operations ipmr_mfc_fops = {
2363         .owner   = THIS_MODULE,
2364         .open    = ipmr_mfc_open,
2365         .read    = seq_read,
2366         .llseek  = seq_lseek,
2367         .release = seq_release_net,
2368 };
2369 #endif
2370
2371 #ifdef CONFIG_IP_PIMSM_V2
2372 static const struct net_protocol pim_protocol = {
2373         .handler        =       pim_rcv,
2374         .netns_ok       =       1,
2375 };
2376 #endif
2377
2378
2379 /*
2380  *      Setup for IP multicast routing
2381  */
2382 static int __net_init ipmr_net_init(struct net *net)
2383 {
2384         int err;
2385
2386         err = ipmr_rules_init(net);
2387         if (err < 0)
2388                 goto fail;
2389
2390 #ifdef CONFIG_PROC_FS
2391         err = -ENOMEM;
2392         if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops))
2393                 goto proc_vif_fail;
2394         if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
2395                 goto proc_cache_fail;
2396 #endif
2397         return 0;
2398
2399 #ifdef CONFIG_PROC_FS
2400 proc_cache_fail:
2401         proc_net_remove(net, "ip_mr_vif");
2402 proc_vif_fail:
2403         ipmr_rules_exit(net);
2404 #endif
2405 fail:
2406         return err;
2407 }
2408
2409 static void __net_exit ipmr_net_exit(struct net *net)
2410 {
2411 #ifdef CONFIG_PROC_FS
2412         proc_net_remove(net, "ip_mr_cache");
2413         proc_net_remove(net, "ip_mr_vif");
2414 #endif
2415         ipmr_rules_exit(net);
2416 }
2417
2418 static struct pernet_operations ipmr_net_ops = {
2419         .init = ipmr_net_init,
2420         .exit = ipmr_net_exit,
2421 };
2422
2423 int __init ip_mr_init(void)
2424 {
2425         int err;
2426
2427         mrt_cachep = kmem_cache_create("ip_mrt_cache",
2428                                        sizeof(struct mfc_cache),
2429                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2430                                        NULL);
2431         if (!mrt_cachep)
2432                 return -ENOMEM;
2433
2434         err = register_pernet_subsys(&ipmr_net_ops);
2435         if (err)
2436                 goto reg_pernet_fail;
2437
2438         err = register_netdevice_notifier(&ip_mr_notifier);
2439         if (err)
2440                 goto reg_notif_fail;
2441 #ifdef CONFIG_IP_PIMSM_V2
2442         if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
2443                 printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n");
2444                 err = -EAGAIN;
2445                 goto add_proto_fail;
2446         }
2447 #endif
2448         rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute);
2449         return 0;
2450
2451 #ifdef CONFIG_IP_PIMSM_V2
2452 add_proto_fail:
2453         unregister_netdevice_notifier(&ip_mr_notifier);
2454 #endif
2455 reg_notif_fail:
2456         unregister_pernet_subsys(&ipmr_net_ops);
2457 reg_pernet_fail:
2458         kmem_cache_destroy(mrt_cachep);
2459         return err;
2460 }