netns: ipmr: declare reg_vif_num per-namespace
/*
 *      IP multicast routing support for mrouted 3.6/3.8
 *
 *              (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *        Linux Consultancy and Custom Driver Development
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Fixes:
 *      Michael Chastain        :       Incorrect size of copying.
 *      Alan Cox                :       Added the cache manager code
 *      Alan Cox                :       Fixed the clone/copy bug and device race.
 *      Mike McLagan            :       Routing by source
 *      Malcolm Beattie         :       Buffer handling fixes.
 *      Alexey Kuznetsov        :       Double buffer free and other fixes.
 *      SVR Anand               :       Fixed several multicast bugs and problems.
 *      Alexey Kuznetsov        :       Status, optimisations and more.
 *      Brad Parker             :       Better behaviour on mrouted upcall
 *                                      overflow.
 *      Carlos Picoto           :       PIMv1 Support
 *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
 *                                      Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/netlink.h>

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif

/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that all changes are serialized by rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *      Multicast router control variables
 */

#define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL)

static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We revert to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and protected
   with the weak lock mrt_lock. The queue of unresolved entries is
   protected with the strong spinlock mfc_unres_lock.

   In this way the data path is entirely free of exclusive locks.
 */

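/* Locking pattern used throughout this file (illustrative sketch only):
 *
 *      read_lock(&mrt_lock);           data path and read-mostly lookups
 *      c = ipmr_cache_find(...);       e.g. vif_table / mfc_cache_array
 *      read_unlock(&mrt_lock);
 *
 *      write_lock_bh(&mrt_lock);       updates, always from process context
 *      ...modify vif_table or hash chains...
 *      write_unlock_bh(&mrt_lock);
 *
 *      spin_lock_bh(&mfc_unres_lock);  unresolved queue, shared with the
 *      ...walk/modify mfc_unres_queue...  expire timer below
 *      spin_unlock_bh(&mfc_unres_lock);
 */
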
static struct kmem_cache *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol;
#endif

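/* Expiry timer for the unresolved queue; armed whenever mfc_unres_queue
   is non-empty and serviced by ipmr_expire_process() below. */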
static struct timer_list ipmr_expire_timer;

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
{
        dev_close(dev);

        dev = __dev_get_by_name(&init_net, "tunl0");
        if (dev) {
                const struct net_device_ops *ops = dev->netdev_ops;
                struct ifreq ifr;
                struct ip_tunnel_parm p;

                memset(&p, 0, sizeof(p));
                p.iph.daddr = v->vifc_rmt_addr.s_addr;
                p.iph.saddr = v->vifc_lcl_addr.s_addr;
                p.iph.version = 4;
                p.iph.ihl = 5;
                p.iph.protocol = IPPROTO_IPIP;
                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
                ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

                if (ops->ndo_do_ioctl) {
                        mm_segment_t oldfs = get_fs();

                        set_fs(KERNEL_DS);
                        ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
                        set_fs(oldfs);
                }
        }
}

static
struct net_device *ipmr_new_tunnel(struct vifctl *v)
{
        struct net_device  *dev;

        dev = __dev_get_by_name(&init_net, "tunl0");

        if (dev) {
                const struct net_device_ops *ops = dev->netdev_ops;
                int err;
                struct ifreq ifr;
                struct ip_tunnel_parm p;
                struct in_device  *in_dev;

                memset(&p, 0, sizeof(p));
                p.iph.daddr = v->vifc_rmt_addr.s_addr;
                p.iph.saddr = v->vifc_lcl_addr.s_addr;
                p.iph.version = 4;
                p.iph.ihl = 5;
                p.iph.protocol = IPPROTO_IPIP;
                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
                ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

                if (ops->ndo_do_ioctl) {
                        mm_segment_t oldfs = get_fs();

                        set_fs(KERNEL_DS);
                        err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
                        set_fs(oldfs);
                } else
                        err = -EOPNOTSUPP;

                dev = NULL;

                if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
                        dev->flags |= IFF_MULTICAST;

                        in_dev = __in_dev_get_rtnl(dev);
                        if (in_dev == NULL)
                                goto failure;

                        ipv4_devconf_setall(in_dev);
                        IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;

                        if (dev_open(dev))
                                goto failure;
                        dev_hold(dev);
                }
        }
        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}

#ifdef CONFIG_IP_PIMSM

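/* Transmit handler for the PIM register device: packets routed out this
 * vif are not put on the wire but bounced to the multicast routing daemon
 * as IGMPMSG_WHOLEPKT upcalls.
 */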
static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
        read_lock(&mrt_lock);
        dev->stats.tx_bytes += skb->len;
        dev->stats.tx_packets++;
        ipmr_cache_report(skb, init_net.ipv4.mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
        read_unlock(&mrt_lock);
        kfree_skb(skb);
        return 0;
}

static const struct net_device_ops reg_vif_netdev_ops = {
        .ndo_start_xmit = reg_vif_xmit,
};

static void reg_vif_setup(struct net_device *dev)
{
        dev->type               = ARPHRD_PIMREG;
        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
        dev->flags              = IFF_NOARP;
        dev->netdev_ops         = &reg_vif_netdev_ops;
        dev->destructor         = free_netdev;
}

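/* Allocate, register and bring up the "pimreg" device backing the
 * VIFF_REGISTER vif. Returns the device with a reference held, or NULL.
 */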
static struct net_device *ipmr_reg_vif(void)
{
        struct net_device *dev;
        struct in_device *in_dev;

        dev = alloc_netdev(0, "pimreg", reg_vif_setup);

        if (dev == NULL)
                return NULL;

        if (register_netdevice(dev)) {
                free_netdev(dev);
                return NULL;
        }
        dev->iflink = 0;

        rcu_read_lock();
        if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
                rcu_read_unlock();
                goto failure;
        }

        ipv4_devconf_setall(in_dev);
        IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
        rcu_read_unlock();

        if (dev_open(dev))
                goto failure;

        dev_hold(dev);

        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}
#endif

/*
 *      Delete a VIF entry
 *      @notify: Set to 1 if the caller is a notifier_call
 */

static int vif_delete(int vifi, int notify)
{
        struct vif_device *v;
        struct net_device *dev;
        struct in_device *in_dev;

        if (vifi < 0 || vifi >= init_net.ipv4.maxvif)
                return -EADDRNOTAVAIL;

        v = &init_net.ipv4.vif_table[vifi];

        write_lock_bh(&mrt_lock);
        dev = v->dev;
        v->dev = NULL;

        if (!dev) {
                write_unlock_bh(&mrt_lock);
                return -EADDRNOTAVAIL;
        }

#ifdef CONFIG_IP_PIMSM
        if (vifi == init_net.ipv4.mroute_reg_vif_num)
                init_net.ipv4.mroute_reg_vif_num = -1;
#endif

        if (vifi+1 == init_net.ipv4.maxvif) {
                int tmp;
                for (tmp=vifi-1; tmp>=0; tmp--) {
                        if (VIF_EXISTS(&init_net, tmp))
                                break;
                }
                init_net.ipv4.maxvif = tmp+1;
        }

        write_unlock_bh(&mrt_lock);

        dev_set_allmulti(dev, -1);

        if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
                IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
                ip_rt_multicast_event(in_dev);
        }

        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
                unregister_netdevice(dev);

        dev_put(dev);
        return 0;
}

static inline void ipmr_cache_free(struct mfc_cache *c)
{
        release_net(mfc_net(c));
        kmem_cache_free(mrt_cachep, c);
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ipmr_destroy_unres(struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        atomic_dec(&init_net.ipv4.cache_resolve_queue_len);

        while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
                        nlh->nlmsg_type = NLMSG_ERROR;
                        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                        skb_trim(skb, nlh->nlmsg_len);
                        e = NLMSG_DATA(nlh);
                        e->error = -ETIMEDOUT;
                        memset(&e->msg, 0, sizeof(e->msg));

                        rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
                } else
                        kfree_skb(skb);
        }

        ipmr_cache_free(c);
}


/* Single timer handler for the whole unresolved queue. */

static void ipmr_expire_process(unsigned long dummy)
{
        unsigned long now;
        unsigned long expires;
        struct mfc_cache *c, **cp;

        if (!spin_trylock(&mfc_unres_lock)) {
                mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
                return;
        }

        if (mfc_unres_queue == NULL)
                goto out;

        now = jiffies;
        expires = 10*HZ;
        cp = &mfc_unres_queue;

        while ((c=*cp) != NULL) {
                if (time_after(c->mfc_un.unres.expires, now)) {
                        unsigned long interval = c->mfc_un.unres.expires - now;
                        if (interval < expires)
                                expires = interval;
                        cp = &c->next;
                        continue;
                }

                *cp = c->next;

                ipmr_destroy_unres(c);
        }

        if (mfc_unres_queue != NULL)
                mod_timer(&ipmr_expire_timer, jiffies + expires);

out:
        spin_unlock(&mfc_unres_lock);
}

/* Fill the oifs list. Called with mrt_lock write-locked. */

static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
{
        int vifi;

        cache->mfc_un.res.minvif = MAXVIFS;
        cache->mfc_un.res.maxvif = 0;
        memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

        for (vifi = 0; vifi < init_net.ipv4.maxvif; vifi++) {
                if (VIF_EXISTS(&init_net, vifi) &&
                    ttls[vifi] && ttls[vifi] < 255) {
                        cache->mfc_un.res.ttls[vifi] = ttls[vifi];
                        if (cache->mfc_un.res.minvif > vifi)
                                cache->mfc_un.res.minvif = vifi;
                        if (cache->mfc_un.res.maxvif <= vifi)
                                cache->mfc_un.res.maxvif = vifi + 1;
                }
        }
}

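/* Add a vif: create or find the underlying device for the requested vif
 * type, enable allmulti on it and fill in the vif_device slot. Runs under
 * RTNL (via ip_mroute_setsockopt).
 */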
static int vif_add(struct vifctl *vifc, int mrtsock)
{
        int vifi = vifc->vifc_vifi;
        struct vif_device *v = &init_net.ipv4.vif_table[vifi];
        struct net_device *dev;
        struct in_device *in_dev;
        int err;

        /* Is vif busy? */
        if (VIF_EXISTS(&init_net, vifi))
                return -EADDRINUSE;

        switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
        case VIFF_REGISTER:
                /*
                 * Special Purpose VIF in PIM
                 * All the packets will be sent to the daemon
                 */
                if (init_net.ipv4.mroute_reg_vif_num >= 0)
                        return -EADDRINUSE;
                dev = ipmr_reg_vif();
                if (!dev)
                        return -ENOBUFS;
                err = dev_set_allmulti(dev, 1);
                if (err) {
                        unregister_netdevice(dev);
                        dev_put(dev);
                        return err;
                }
                break;
#endif
        case VIFF_TUNNEL:
                dev = ipmr_new_tunnel(vifc);
                if (!dev)
                        return -ENOBUFS;
                err = dev_set_allmulti(dev, 1);
                if (err) {
                        ipmr_del_tunnel(dev, vifc);
                        dev_put(dev);
                        return err;
                }
                break;
        case 0:
                dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
                if (!dev)
                        return -EADDRNOTAVAIL;
                err = dev_set_allmulti(dev, 1);
                if (err) {
                        dev_put(dev);
                        return err;
                }
                break;
        default:
                return -EINVAL;
        }

        if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
                return -EADDRNOTAVAIL;
        IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
        ip_rt_multicast_event(in_dev);

        /*
         *      Fill in the VIF structures
         */
        v->rate_limit = vifc->vifc_rate_limit;
        v->local = vifc->vifc_lcl_addr.s_addr;
        v->remote = vifc->vifc_rmt_addr.s_addr;
        v->flags = vifc->vifc_flags;
        if (!mrtsock)
                v->flags |= VIFF_STATIC;
        v->threshold = vifc->vifc_threshold;
        v->bytes_in = 0;
        v->bytes_out = 0;
        v->pkt_in = 0;
        v->pkt_out = 0;
        v->link = dev->ifindex;
        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                v->link = dev->iflink;

        /* And finish update writing critical data */
        write_lock_bh(&mrt_lock);
        v->dev = dev;
#ifdef CONFIG_IP_PIMSM
        if (v->flags&VIFF_REGISTER)
                init_net.ipv4.mroute_reg_vif_num = vifi;
#endif
        if (vifi+1 > init_net.ipv4.maxvif)
                init_net.ipv4.maxvif = vifi+1;
        write_unlock_bh(&mrt_lock);
        return 0;
}

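/* Look up a resolved cache entry by (origin, group) in the MFC hash.
 * Caller must hold mrt_lock.
 */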
static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
{
        int line = MFC_HASH(mcastgrp, origin);
        struct mfc_cache *c;

        for (c = init_net.ipv4.mfc_cache_array[line]; c; c = c->next) {
                if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
                        break;
        }
        return c;
}

/*
 *      Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(struct net *net)
{
        struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
        if (c == NULL)
                return NULL;
        c->mfc_un.res.minvif = MAXVIFS;
        mfc_net_set(c, net);
        return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net)
{
        struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
        if (c == NULL)
                return NULL;
        skb_queue_head_init(&c->mfc_un.unres.unresolved);
        c->mfc_un.unres.expires = jiffies + 10*HZ;
        mfc_net_set(c, net);
        return c;
}

/*
 *      A cache entry has gone into a resolved state from queued
 */

static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        /*
         *      Play the pending entries through our router
         */

        while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

                        if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
                                nlh->nlmsg_len = (skb_tail_pointer(skb) -
                                                  (u8 *)nlh);
                        } else {
                                nlh->nlmsg_type = NLMSG_ERROR;
                                nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                                skb_trim(skb, nlh->nlmsg_len);
                                e = NLMSG_DATA(nlh);
                                e->error = -EMSGSIZE;
                                memset(&e->msg, 0, sizeof(e->msg));
                        }

                        rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
                } else
                        ip_mr_forward(skb, c, 0);
        }
}

/*
 *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *      expects the following bizarre scheme.
 *
 *      Called under mrt_lock.
 */

static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
{
        struct sk_buff *skb;
        const int ihl = ip_hdrlen(pkt);
        struct igmphdr *igmp;
        struct igmpmsg *msg;
        int ret;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT)
                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
        else
#endif
                skb = alloc_skb(128, GFP_ATOMIC);

        if (!skb)
                return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT) {
                /* Ugly, but we have no choice with this interface.
                   Duplicate old header, fix ihl, length etc.
                   And all this only to mangle msg->im_msgtype and
                   to set msg->im_mbz to "mbz" :-)
                 */
                skb_push(skb, sizeof(struct iphdr));
                skb_reset_network_header(skb);
                skb_reset_transport_header(skb);
                msg = (struct igmpmsg *)skb_network_header(skb);
                memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
                msg->im_msgtype = IGMPMSG_WHOLEPKT;
                msg->im_mbz = 0;
                msg->im_vif = init_net.ipv4.mroute_reg_vif_num;
                ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
                ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
                                             sizeof(struct iphdr));
        } else
#endif
        {

        /*
         *      Copy the IP header
         */

        skb->network_header = skb->tail;
        skb_put(skb, ihl);
        skb_copy_to_linear_data(skb, pkt->data, ihl);
        ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
        msg = (struct igmpmsg *)skb_network_header(skb);
        msg->im_vif = vifi;
        skb->dst = dst_clone(pkt->dst);

        /*
         *      Add our header
         */

        igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
        igmp->type      =
        msg->im_msgtype = assert;
        igmp->code      =       0;
        ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
        skb->transport_header = skb->network_header;
        }

        if (init_net.ipv4.mroute_sk == NULL) {
                kfree_skb(skb);
                return -EINVAL;
        }

        /*
         *      Deliver to mrouted
         */
        ret = sock_queue_rcv_skb(init_net.ipv4.mroute_sk, skb);
        if (ret < 0) {
                if (net_ratelimit())
                        printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
                kfree_skb(skb);
        }

        return ret;
}

/*
 *      Queue a packet for resolution. It gets attached to a locked cache entry.
 */

static int
ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
{
        int err;
        struct mfc_cache *c;
        const struct iphdr *iph = ip_hdr(skb);

        spin_lock_bh(&mfc_unres_lock);
        for (c=mfc_unres_queue; c; c=c->next) {
                if (net_eq(mfc_net(c), &init_net) &&
                    c->mfc_mcastgrp == iph->daddr &&
                    c->mfc_origin == iph->saddr)
                        break;
        }

        if (c == NULL) {
                /*
                 *      Create a new entry if allowable
                 */

                if (atomic_read(&init_net.ipv4.cache_resolve_queue_len) >= 10 ||
                    (c = ipmr_cache_alloc_unres(&init_net)) == NULL) {
                        spin_unlock_bh(&mfc_unres_lock);

                        kfree_skb(skb);
                        return -ENOBUFS;
                }

                /*
                 *      Fill in the new cache entry
                 */
                c->mfc_parent   = -1;
                c->mfc_origin   = iph->saddr;
                c->mfc_mcastgrp = iph->daddr;

                /*
                 *      Reflect first query at mrouted.
                 */
                if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
                        /* If the report failed throw the cache entry
                           out - Brad Parker
                         */
                        spin_unlock_bh(&mfc_unres_lock);

                        ipmr_cache_free(c);
                        kfree_skb(skb);
                        return err;
                }

                atomic_inc(&init_net.ipv4.cache_resolve_queue_len);
                c->next = mfc_unres_queue;
                mfc_unres_queue = c;

                mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
        }

        /*
         *      See if we can append the packet
         */
        if (c->mfc_un.unres.unresolved.qlen>3) {
                kfree_skb(skb);
                err = -ENOBUFS;
        } else {
                skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
                err = 0;
        }

        spin_unlock_bh(&mfc_unres_lock);
        return err;
}

/*
 *      MFC cache manipulation by user space mroute daemon
 */

static int ipmr_mfc_delete(struct mfcctl *mfc)
{
        int line;
        struct mfc_cache *c, **cp;

        line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp = &init_net.ipv4.mfc_cache_array[line];
             (c = *cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        ipmr_cache_free(c);
                        return 0;
                }
        }
        return -ENOENT;
}

static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
{
        int line;
        struct mfc_cache *uc, *c, **cp;

        line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp = &init_net.ipv4.mfc_cache_array[line];
             (c = *cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
                        break;
        }

        if (c != NULL) {
                write_lock_bh(&mrt_lock);
                c->mfc_parent = mfc->mfcc_parent;
                ipmr_update_thresholds(c, mfc->mfcc_ttls);
                if (!mrtsock)
                        c->mfc_flags |= MFC_STATIC;
                write_unlock_bh(&mrt_lock);
                return 0;
        }

        if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
                return -EINVAL;

        c = ipmr_cache_alloc(&init_net);
        if (c == NULL)
                return -ENOMEM;

        c->mfc_origin = mfc->mfcc_origin.s_addr;
        c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
        c->mfc_parent = mfc->mfcc_parent;
        ipmr_update_thresholds(c, mfc->mfcc_ttls);
        if (!mrtsock)
                c->mfc_flags |= MFC_STATIC;

        write_lock_bh(&mrt_lock);
        c->next = init_net.ipv4.mfc_cache_array[line];
        init_net.ipv4.mfc_cache_array[line] = c;
        write_unlock_bh(&mrt_lock);

        /*
         *      Check to see if we resolved a queued list. If so we
         *      need to send on the frames and tidy up.
         */
        spin_lock_bh(&mfc_unres_lock);
        for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
             cp = &uc->next) {
                if (net_eq(mfc_net(uc), &init_net) &&
                    uc->mfc_origin == c->mfc_origin &&
                    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
                        *cp = uc->next;
                        atomic_dec(&init_net.ipv4.cache_resolve_queue_len);
                        break;
                }
        }
        if (mfc_unres_queue == NULL)
                del_timer(&ipmr_expire_timer);
        spin_unlock_bh(&mfc_unres_lock);

        if (uc) {
                ipmr_cache_resolve(uc, c);
                ipmr_cache_free(uc);
        }
        return 0;
}

/*
 *      Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct sock *sk)
{
        int i;

        /*
         *      Shut down all active vif entries
         */
        for (i = 0; i < init_net.ipv4.maxvif; i++) {
                if (!(init_net.ipv4.vif_table[i].flags&VIFF_STATIC))
                        vif_delete(i, 0);
        }

        /*
         *      Wipe the cache
         */
        for (i=0; i<MFC_LINES; i++) {
                struct mfc_cache *c, **cp;

                cp = &init_net.ipv4.mfc_cache_array[i];
                while ((c = *cp) != NULL) {
                        if (c->mfc_flags&MFC_STATIC) {
                                cp = &c->next;
                                continue;
                        }
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        ipmr_cache_free(c);
                }
        }

        if (atomic_read(&init_net.ipv4.cache_resolve_queue_len) != 0) {
                struct mfc_cache *c, **cp;

                spin_lock_bh(&mfc_unres_lock);
                cp = &mfc_unres_queue;
                while ((c = *cp) != NULL) {
                        if (!net_eq(mfc_net(c), &init_net)) {
                                cp = &c->next;
                                continue;
                        }
                        *cp = c->next;

                        ipmr_destroy_unres(c);
                }
                spin_unlock_bh(&mfc_unres_lock);
        }
}

static void mrtsock_destruct(struct sock *sk)
{
        rtnl_lock();
        if (sk == init_net.ipv4.mroute_sk) {
                IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--;

                write_lock_bh(&mrt_lock);
                init_net.ipv4.mroute_sk = NULL;
                write_unlock_bh(&mrt_lock);

                mroute_clean_tables(sk);
        }
        rtnl_unlock();
}

/*
 *      Socket options and virtual interface manipulation. The whole
 *      virtual interface system is a complete heap, but unfortunately
 *      that's how BSD mrouted happens to think. Maybe one day with a proper
 *      MOSPF/PIM router set up we can clean this up.
 */

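/* Userspace view of this interface (illustrative sketch, not kernel code):
 * a routing daemon such as mrouted claims the mroute socket roughly as
 *
 *      int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *      int on = 1;
 *      setsockopt(s, IPPROTO_IP, MRT_INIT, &on, sizeof(on));
 *      ...MRT_ADD_VIF / MRT_ADD_MFC with struct vifctl / struct mfcctl...
 *
 * and releases it again with MRT_DONE.
 */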
int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int optlen)
{
        int ret;
        struct vifctl vif;
        struct mfcctl mfc;

        if (optname != MRT_INIT) {
                if (sk != init_net.ipv4.mroute_sk && !capable(CAP_NET_ADMIN))
                        return -EACCES;
        }

        switch (optname) {
        case MRT_INIT:
                if (sk->sk_type != SOCK_RAW ||
                    inet_sk(sk)->num != IPPROTO_IGMP)
                        return -EOPNOTSUPP;
                if (optlen != sizeof(int))
                        return -ENOPROTOOPT;

                rtnl_lock();
                if (init_net.ipv4.mroute_sk) {
                        rtnl_unlock();
                        return -EADDRINUSE;
                }

                ret = ip_ra_control(sk, 1, mrtsock_destruct);
                if (ret == 0) {
                        write_lock_bh(&mrt_lock);
                        init_net.ipv4.mroute_sk = sk;
                        write_unlock_bh(&mrt_lock);

                        IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++;
                }
                rtnl_unlock();
                return ret;
        case MRT_DONE:
                if (sk != init_net.ipv4.mroute_sk)
                        return -EACCES;
                return ip_ra_control(sk, 0, NULL);
        case MRT_ADD_VIF:
        case MRT_DEL_VIF:
                if (optlen != sizeof(vif))
                        return -EINVAL;
                if (copy_from_user(&vif, optval, sizeof(vif)))
                        return -EFAULT;
                if (vif.vifc_vifi >= MAXVIFS)
                        return -ENFILE;
                rtnl_lock();
                if (optname == MRT_ADD_VIF) {
                        ret = vif_add(&vif, sk == init_net.ipv4.mroute_sk);
                } else {
                        ret = vif_delete(vif.vifc_vifi, 0);
                }
                rtnl_unlock();
                return ret;

                /*
                 *      Manipulate the forwarding caches. These live
                 *      in a sort of kernel/user symbiosis.
                 */
        case MRT_ADD_MFC:
        case MRT_DEL_MFC:
                if (optlen != sizeof(mfc))
                        return -EINVAL;
                if (copy_from_user(&mfc, optval, sizeof(mfc)))
                        return -EFAULT;
                rtnl_lock();
                if (optname == MRT_DEL_MFC)
                        ret = ipmr_mfc_delete(&mfc);
                else
                        ret = ipmr_mfc_add(&mfc, sk == init_net.ipv4.mroute_sk);
                rtnl_unlock();
                return ret;
                /*
                 *      Control PIM assert.
                 */
        case MRT_ASSERT:
        {
                int v;
                if (get_user(v,(int __user *)optval))
                        return -EFAULT;
                init_net.ipv4.mroute_do_assert = (v) ? 1 : 0;
                return 0;
        }
#ifdef CONFIG_IP_PIMSM
        case MRT_PIM:
        {
                int v;

                if (get_user(v,(int __user *)optval))
                        return -EFAULT;
                v = (v) ? 1 : 0;

                rtnl_lock();
                ret = 0;
                if (v != init_net.ipv4.mroute_do_pim) {
                        init_net.ipv4.mroute_do_pim = v;
                        init_net.ipv4.mroute_do_assert = v;
#ifdef CONFIG_IP_PIMSM_V2
                        if (init_net.ipv4.mroute_do_pim)
                                ret = inet_add_protocol(&pim_protocol,
                                                        IPPROTO_PIM);
                        else
                                ret = inet_del_protocol(&pim_protocol,
                                                        IPPROTO_PIM);
                        if (ret < 0)
                                ret = -EAGAIN;
#endif
                }
                rtnl_unlock();
                return ret;
        }
#endif
        /*
         *      Spurious command, or MRT_VERSION which you cannot
         *      set.
         */
        default:
                return -ENOPROTOOPT;
        }
}

/*
 *      Getsock opt support for the multicast routing system.
 */

int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
{
        int olr;
        int val;

        if (optname != MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
           optname!=MRT_PIM &&
#endif
           optname!=MRT_ASSERT)
                return -ENOPROTOOPT;

        if (get_user(olr, optlen))
                return -EFAULT;

        olr = min_t(unsigned int, olr, sizeof(int));
        if (olr < 0)
                return -EINVAL;

        if (put_user(olr, optlen))
                return -EFAULT;
        if (optname == MRT_VERSION)
                val = 0x0305;
#ifdef CONFIG_IP_PIMSM
        else if (optname == MRT_PIM)
                val = init_net.ipv4.mroute_do_pim;
#endif
        else
                val = init_net.ipv4.mroute_do_assert;
        if (copy_to_user(optval, &val, olr))
                return -EFAULT;
        return 0;
}

/*
 *      The IP multicast ioctl support routines.
 */

int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
        struct sioc_sg_req sr;
        struct sioc_vif_req vr;
        struct vif_device *vif;
        struct mfc_cache *c;

        switch (cmd) {
        case SIOCGETVIFCNT:
                if (copy_from_user(&vr, arg, sizeof(vr)))
                        return -EFAULT;
                if (vr.vifi >= init_net.ipv4.maxvif)
                        return -EINVAL;
                read_lock(&mrt_lock);
                vif = &init_net.ipv4.vif_table[vr.vifi];
                if (VIF_EXISTS(&init_net, vr.vifi)) {
                        vr.icount = vif->pkt_in;
                        vr.ocount = vif->pkt_out;
                        vr.ibytes = vif->bytes_in;
                        vr.obytes = vif->bytes_out;
                        read_unlock(&mrt_lock);

                        if (copy_to_user(arg, &vr, sizeof(vr)))
                                return -EFAULT;
                        return 0;
                }
                read_unlock(&mrt_lock);
                return -EADDRNOTAVAIL;
        case SIOCGETSGCNT:
                if (copy_from_user(&sr, arg, sizeof(sr)))
                        return -EFAULT;

                read_lock(&mrt_lock);
                c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
                if (c) {
                        sr.pktcnt = c->mfc_un.res.pkt;
                        sr.bytecnt = c->mfc_un.res.bytes;
                        sr.wrong_if = c->mfc_un.res.wrong_if;
                        read_unlock(&mrt_lock);

                        if (copy_to_user(arg, &sr, sizeof(sr)))
                                return -EFAULT;
                        return 0;
                }
                read_unlock(&mrt_lock);
                return -EADDRNOTAVAIL;
        default:
                return -ENOIOCTLCMD;
        }
}


static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;
        struct vif_device *v;
        int ct;

        if (!net_eq(dev_net(dev), &init_net))
                return NOTIFY_DONE;

        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;
        v = &init_net.ipv4.vif_table[0];
        for (ct = 0; ct < init_net.ipv4.maxvif; ct++, v++) {
                if (v->dev == dev)
                        vif_delete(ct, 1);
        }
        return NOTIFY_DONE;
}


static struct notifier_block ip_mr_notifier = {
        .notifier_call = ipmr_device_event,
};

/*
 *      Encapsulate a packet by attaching a valid IPIP header to it.
 *      This avoids tunnel drivers and other mess and gives us the speed so
 *      important for multicast video.
 */

static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct iphdr *iph;
        struct iphdr *old_iph = ip_hdr(skb);

        skb_push(skb, sizeof(struct iphdr));
        skb->transport_header = skb->network_header;
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);

        iph->version    =       4;
        iph->tos        =       old_iph->tos;
        iph->ttl        =       old_iph->ttl;
        iph->frag_off   =       0;
        iph->daddr      =       daddr;
        iph->saddr      =       saddr;
        iph->protocol   =       IPPROTO_IPIP;
        iph->ihl        =       5;
        iph->tot_len    =       htons(skb->len);
        ip_select_ident(iph, skb->dst, NULL);
        ip_send_check(iph);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        nf_reset(skb);
}

static inline int ipmr_forward_finish(struct sk_buff *skb)
{
        struct ip_options * opt = &(IPCB(skb)->opt);

        IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);

        if (unlikely(opt->optlen))
                ip_forward_options(skb);

        return dst_output(skb);
}

/*
 *      Processing handlers for ipmr_forward
 */

static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
        const struct iphdr *iph = ip_hdr(skb);
        struct vif_device *vif = &init_net.ipv4.vif_table[vifi];
        struct net_device *dev;
        struct rtable *rt;
        int    encap = 0;

        if (vif->dev == NULL)
                goto out_free;

#ifdef CONFIG_IP_PIMSM
        if (vif->flags & VIFF_REGISTER) {
                vif->pkt_out++;
                vif->bytes_out += skb->len;
                vif->dev->stats.tx_bytes += skb->len;
                vif->dev->stats.tx_packets++;
                ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
                kfree_skb(skb);
                return;
        }
#endif

        if (vif->flags&VIFF_TUNNEL) {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = vif->remote,
                                                .saddr = vif->local,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&init_net, &rt, &fl))
                        goto out_free;
                encap = sizeof(struct iphdr);
        } else {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = iph->daddr,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&init_net, &rt, &fl))
                        goto out_free;
        }

        dev = rt->u.dst.dev;

        if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
                /* Do not fragment multicasts. Alas, IPv4 gives us no way to
                   send ICMP here, so such packets simply disappear into a
                   black hole.
                 */

                IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
                ip_rt_put(rt);
                goto out_free;
        }

        encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;

        if (skb_cow(skb, encap)) {
                ip_rt_put(rt);
                goto out_free;
        }

        vif->pkt_out++;
        vif->bytes_out += skb->len;

        dst_release(skb->dst);
        skb->dst = &rt->u.dst;
        ip_decrease_ttl(ip_hdr(skb));

        /* FIXME: forward and output firewalls used to be called here.
         * What do we do with netfilter? -- RR */
        if (vif->flags & VIFF_TUNNEL) {
                ip_encap(skb, vif->local, vif->remote);
                /* FIXME: extra output firewall step used to be here. --RR */
                vif->dev->stats.tx_packets++;
                vif->dev->stats.tx_bytes += skb->len;
        }

        IPCB(skb)->flags |= IPSKB_FORWARDED;

        /*
         * RFC1584 teaches that a DVMRP/PIM router must deliver packets locally
         * not only before forwarding, but also after forwarding on all output
         * interfaces. Clearly, if an mrouter runs a multicasting program, it
         * should receive packets regardless of which interface the program
         * has joined on. Without this, the program would have to join on all
         * interfaces. On the other hand, a multihoming host (or a router, but
         * not an mrouter) cannot join on more than one interface - it would
         * result in receiving multiple packets.
         */
        NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
                ipmr_forward_finish);
        return;

out_free:
        kfree_skb(skb);
        return;
}

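/* Reverse lookup: map a net_device back to its vif index, or -1 if the
 * device is not a vif. Called under mrt_lock.
 */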
static int ipmr_find_vif(struct net_device *dev)
{
        int ct;
        for (ct = init_net.ipv4.maxvif-1; ct >= 0; ct--) {
                if (init_net.ipv4.vif_table[ct].dev == dev)
                        break;
        }
        return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
{
        int psend = -1;
        int vif, ct;

        vif = cache->mfc_parent;
        cache->mfc_un.res.pkt++;
        cache->mfc_un.res.bytes += skb->len;

        /*
         * Wrong interface: drop packet and (maybe) send PIM assert.
         */
        if (init_net.ipv4.vif_table[vif].dev != skb->dev) {
                int true_vifi;

                if (skb->rtable->fl.iif == 0) {
                        /* It is our own packet, looped back.
                           Very complicated situation...

                           The best workaround until routing daemons are
                           fixed is not to redistribute a packet if it was
                           sent through the wrong interface. It means that
                           multicast applications WILL NOT work for
                           (S,G) entries whose default multicast route points
                           to a wrong oif. In any case, it is not a good
                           idea to use multicasting applications on a router.
                         */
                        goto dont_forward;
                }

                cache->mfc_un.res.wrong_if++;
                true_vifi = ipmr_find_vif(skb->dev);

                if (true_vifi >= 0 && init_net.ipv4.mroute_do_assert &&
                    /* pimsm uses asserts when switching from RPT to SPT,
                       so we cannot check that the packet arrived on an oif.
                       That is bad, but otherwise we would need to move a
                       pretty large chunk of pimd into the kernel. Ough... --ANK
                     */
                    (init_net.ipv4.mroute_do_pim ||
                     cache->mfc_un.res.ttls[true_vifi] < 255) &&
                    time_after(jiffies,
                               cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
                        cache->mfc_un.res.last_assert = jiffies;
                        ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
                }
                goto dont_forward;
        }

        init_net.ipv4.vif_table[vif].pkt_in++;
        init_net.ipv4.vif_table[vif].bytes_in += skb->len;

        /*
         *      Forward the frame
         */
        for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
                if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
                        if (psend != -1) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        ipmr_queue_xmit(skb2, cache, psend);
                        }
                        psend = ct;
                }
        }
        if (psend != -1) {
                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        if (skb2)
                                ipmr_queue_xmit(skb2, cache, psend);
                } else {
                        ipmr_queue_xmit(skb, cache, psend);
                        return 0;
                }
        }

dont_forward:
        if (!local)
                kfree_skb(skb);
        return 0;
}


/*
 *      Multicast packets for forwarding arrive here
 */

int ip_mr_input(struct sk_buff *skb)
{
        struct mfc_cache *cache;
        int local = skb->rtable->rt_flags&RTCF_LOCAL;

        /* Packet is looped back after forwarding; it should not be
           forwarded a second time, but it can still be delivered locally.
         */
        if (IPCB(skb)->flags&IPSKB_FORWARDED)
                goto dont_forward;

        if (!local) {
                    if (IPCB(skb)->opt.router_alert) {
                            if (ip_call_ra_chain(skb))
                                    return 0;
                    } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
                            /* IGMPv1 (and broken IGMPv2 implementations such as
                               Cisco IOS <= 11.2(8)) do not put the router alert
                               option into IGMP packets destined for routable
                               groups. This is very bad, because it means
                               that we can forward NO IGMP messages.
                             */
                            read_lock(&mrt_lock);
                            if (init_net.ipv4.mroute_sk) {
                                    nf_reset(skb);
                                    raw_rcv(init_net.ipv4.mroute_sk, skb);
                                    read_unlock(&mrt_lock);
                                    return 0;
                            }
                            read_unlock(&mrt_lock);
                    }
        }

        read_lock(&mrt_lock);
        cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);

        /*
         *      No usable cache entry
         */
        if (cache == NULL) {
                int vif;

                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        ip_local_deliver(skb);
                        if (skb2 == NULL) {
                                read_unlock(&mrt_lock);
                                return -ENOBUFS;
                        }
                        skb = skb2;
                }

                vif = ipmr_find_vif(skb->dev);
                if (vif >= 0) {
                        int err = ipmr_cache_unresolved(vif, skb);
                        read_unlock(&mrt_lock);

                        return err;
                }
                read_unlock(&mrt_lock);
                kfree_skb(skb);
                return -ENODEV;
        }

        ip_mr_forward(skb, cache, local);

        read_unlock(&mrt_lock);

        if (local)
                return ip_local_deliver(skb);

        return 0;

dont_forward:
        if (local)
                return ip_local_deliver(skb);
        kfree_skb(skb);
        return 0;
}

#ifdef CONFIG_IP_PIMSM
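/* Common PIMv1/PIMv2 register handling: validate the encapsulated packet,
 * then re-inject it into the stack as if it had arrived on the register
 * vif. Returns 0 on success, nonzero if the caller should drop the skb.
 */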
static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
{
        struct net_device *reg_dev = NULL;
        struct iphdr *encap;

        encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
        /*
           Check that:
           a. packet is really destined to a multicast group
           b. packet is not a NULL-REGISTER
           c. packet is not truncated
         */
        if (!ipv4_is_multicast(encap->daddr) ||
            encap->tot_len == 0 ||
            ntohs(encap->tot_len) + pimlen > skb->len)
                return 1;

        read_lock(&mrt_lock);
        if (init_net.ipv4.mroute_reg_vif_num >= 0)
                reg_dev = init_net.ipv4.vif_table[init_net.ipv4.mroute_reg_vif_num].dev;
        if (reg_dev)
                dev_hold(reg_dev);
        read_unlock(&mrt_lock);

        if (reg_dev == NULL)
                return 1;

        skb->mac_header = skb->network_header;
        skb_pull(skb, (u8*)encap - skb->data);
        skb_reset_network_header(skb);
        skb->dev = reg_dev;
        skb->protocol = htons(ETH_P_IP);
        skb->ip_summed = CHECKSUM_NONE;
        skb->pkt_type = PACKET_HOST;
        dst_release(skb->dst);
        skb->dst = NULL;
        reg_dev->stats.rx_bytes += skb->len;
        reg_dev->stats.rx_packets++;
        nf_reset(skb);
        netif_rx(skb);
        dev_put(reg_dev);

        return 0;
}
#endif

#ifdef CONFIG_IP_PIMSM_V1
/*
 * Handle IGMP messages of PIMv1
 */

int pim_rcv_v1(struct sk_buff * skb)
{
        struct igmphdr *pim;

        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
                goto drop;

        pim = igmp_hdr(skb);

        if (!init_net.ipv4.mroute_do_pim ||
            pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
                goto drop;

        if (__pim_rcv(skb, sizeof(*pim))) {
drop:
                kfree_skb(skb);
        }
        return 0;
}
#endif

#ifdef CONFIG_IP_PIMSM_V2
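/* Handle PIMv2 register packets: check type, flags and checksum before
 * handing the skb to __pim_rcv().
 */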
static int pim_rcv(struct sk_buff *skb)
{
	struct pimreghdr *pim;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
		goto drop;

	pim = (struct pimreghdr *)skb_transport_header(skb);
	if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
	    (pim->flags & PIM_NULL_REGISTER) ||
	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
		goto drop;

	if (__pim_rcv(skb, sizeof(*pim))) {
drop:
		kfree_skb(skb);
	}
	return 0;
}
#endif

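/*
 * Fill an rtnetlink reply for a resolved cache entry: RTA_IIF carries
 * the ifindex of the parent (input) vif, and RTA_MULTIPATH carries one
 * rtnexthop per output vif with the TTL threshold in rtnh_hops.
 */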
static int
ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
{
	int ct;
	struct rtnexthop *nhp;
	struct net_device *dev = init_net.ipv4.vif_table[c->mfc_parent].dev;
	u8 *b = skb_tail_pointer(skb);
	struct rtattr *mp_head;

	if (dev)
		RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);

	mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));

	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
		if (c->mfc_un.res.ttls[ct] < 255) {
			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
				goto rtattr_failure;
			nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
			nhp->rtnh_flags = 0;
			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
			nhp->rtnh_ifindex = init_net.ipv4.vif_table[ct].dev->ifindex;
			nhp->rtnh_len = sizeof(*nhp);
		}
	}
	mp_head->rta_type = RTA_MULTIPATH;
	mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
	rtm->rtm_type = RTN_MULTICAST;
	return 1;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -EMSGSIZE;
}

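/*
 * rtnetlink cache lookup.  If no cache entry exists yet, a clone of
 * the skb is given a minimal IP header whose version field is set to
 * 0, so that the resolver can later tell this synthetic netlink
 * request apart from a real queued packet, and is then queued as an
 * unresolved entry.
 */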
int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
	int err;
	struct mfc_cache *cache;
	struct rtable *rt = skb->rtable;

	read_lock(&mrt_lock);
	cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);

	if (cache == NULL) {
		struct sk_buff *skb2;
		struct iphdr *iph;
		struct net_device *dev;
		int vif;

		if (nowait) {
			read_unlock(&mrt_lock);
			return -EAGAIN;
		}

		dev = skb->dev;
		if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
			read_unlock(&mrt_lock);
			return -ENODEV;
		}
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2) {
			read_unlock(&mrt_lock);
			return -ENOMEM;
		}

		skb_push(skb2, sizeof(struct iphdr));
		skb_reset_network_header(skb2);
		iph = ip_hdr(skb2);
		iph->ihl = sizeof(struct iphdr) >> 2;
		iph->saddr = rt->rt_src;
		iph->daddr = rt->rt_dst;
		iph->version = 0;
		err = ipmr_cache_unresolved(vif, skb2);
		read_unlock(&mrt_lock);
		return err;
	}

	if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
		cache->mfc_flags |= MFC_NOTIFY;
	err = ipmr_fill_mroute(skb, cache, rtm);
	read_unlock(&mrt_lock);
	return err;
}

#ifdef CONFIG_PROC_FS
/*
 *	The /proc interfaces to multicast routing:
 *	/proc/net/ip_mr_cache and /proc/net/ip_mr_vif
 */
struct ipmr_vif_iter {
	int ct;
};

static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
					   loff_t pos)
{
	for (iter->ct = 0; iter->ct < init_net.ipv4.maxvif; ++iter->ct) {
		if (!VIF_EXISTS(&init_net, iter->ct))
			continue;
		if (pos-- == 0)
			return &init_net.ipv4.vif_table[iter->ct];
	}
	return NULL;
}

static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(mrt_lock)
{
	read_lock(&mrt_lock);
	return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
		: SEQ_START_TOKEN;
}

static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ipmr_vif_iter *iter = seq->private;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ipmr_vif_seq_idx(iter, 0);

	while (++iter->ct < init_net.ipv4.maxvif) {
		if (!VIF_EXISTS(&init_net, iter->ct))
			continue;
		return &init_net.ipv4.vif_table[iter->ct];
	}
	return NULL;
}

static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
	__releases(mrt_lock)
{
	read_unlock(&mrt_lock);
}

static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
	} else {
		const struct vif_device *vif = v;
		const char *name = vif->dev ? vif->dev->name : "none";

		seq_printf(seq,
			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
			   vif - init_net.ipv4.vif_table,
			   name, vif->bytes_in, vif->pkt_in,
			   vif->bytes_out, vif->pkt_out,
			   vif->flags, vif->local, vif->remote);
	}
	return 0;
}
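
/*
 * Example of the resulting /proc/net/ip_mr_vif layout (values are
 * illustrative only):
 *
 * Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
 *  0 eth0          151320    1284    302640    2568 00000 C0A80101 00000000
 */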

static const struct seq_operations ipmr_vif_seq_ops = {
	.start = ipmr_vif_seq_start,
	.next  = ipmr_vif_seq_next,
	.stop  = ipmr_vif_seq_stop,
	.show  = ipmr_vif_seq_show,
};

static int ipmr_vif_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &ipmr_vif_seq_ops,
			sizeof(struct ipmr_vif_iter));
}

static const struct file_operations ipmr_vif_fops = {
	.owner   = THIS_MODULE,
	.open    = ipmr_vif_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};

struct ipmr_mfc_iter {
	struct mfc_cache **cache;
	int ct;
};

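/*
 * The mfc seq_file walks two lists: the resolved entries in
 * mfc_cache_array (protected by mrt_lock) and then the unresolved
 * queue (protected by mfc_unres_lock).  it->cache records which list
 * the current entry came from, and whichever lock covers that list is
 * left held while entries are being returned; it is dropped either on
 * the hand-over in ipmr_mfc_seq_next() or in ipmr_mfc_seq_stop().
 */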
static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
{
	struct mfc_cache *mfc;

	it->cache = init_net.ipv4.mfc_cache_array;
	read_lock(&mrt_lock);
	for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
		for (mfc = init_net.ipv4.mfc_cache_array[it->ct];
		     mfc; mfc = mfc->next)
			if (pos-- == 0)
				return mfc;
	read_unlock(&mrt_lock);

	it->cache = &mfc_unres_queue;
	spin_lock_bh(&mfc_unres_lock);
	for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
		if (pos-- == 0)
			return mfc;
	spin_unlock_bh(&mfc_unres_lock);

	it->cache = NULL;
	return NULL;
}

static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct ipmr_mfc_iter *it = seq->private;

	it->cache = NULL;
	it->ct = 0;
	return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
		: SEQ_START_TOKEN;
}

static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct mfc_cache *mfc = v;
	struct ipmr_mfc_iter *it = seq->private;

	++*pos;

	if (v == SEQ_START_TOKEN)
		return ipmr_mfc_seq_idx(seq->private, 0);

	if (mfc->next)
		return mfc->next;

	if (it->cache == &mfc_unres_queue)
		goto end_of_list;

	BUG_ON(it->cache != init_net.ipv4.mfc_cache_array);

	while (++it->ct < MFC_LINES) {
		mfc = init_net.ipv4.mfc_cache_array[it->ct];
		if (mfc)
			return mfc;
	}

	/* exhausted cache_array, show unresolved */
	read_unlock(&mrt_lock);
	it->cache = &mfc_unres_queue;
	it->ct = 0;

	spin_lock_bh(&mfc_unres_lock);
	mfc = mfc_unres_queue;
	if (mfc)
		return mfc;

 end_of_list:
	spin_unlock_bh(&mfc_unres_lock);
	it->cache = NULL;

	return NULL;
}

static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
{
	struct ipmr_mfc_iter *it = seq->private;

	if (it->cache == &mfc_unres_queue)
		spin_unlock_bh(&mfc_unres_lock);
	else if (it->cache == init_net.ipv4.mfc_cache_array)
		read_unlock(&mrt_lock);
}

static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
	int n;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
	} else {
		const struct mfc_cache *mfc = v;
		const struct ipmr_mfc_iter *it = seq->private;

		seq_printf(seq, "%08lX %08lX %-3hd",
			   (unsigned long) mfc->mfc_mcastgrp,
			   (unsigned long) mfc->mfc_origin,
			   mfc->mfc_parent);

		if (it->cache != &mfc_unres_queue) {
			seq_printf(seq, " %8lu %8lu %8lu",
				   mfc->mfc_un.res.pkt,
				   mfc->mfc_un.res.bytes,
				   mfc->mfc_un.res.wrong_if);
			for (n = mfc->mfc_un.res.minvif;
			     n < mfc->mfc_un.res.maxvif; n++) {
				if (VIF_EXISTS(&init_net, n) &&
				    mfc->mfc_un.res.ttls[n] < 255)
					seq_printf(seq,
					   " %2d:%-3d",
					   n, mfc->mfc_un.res.ttls[n]);
			}
		} else {
			/* unresolved mfc_caches don't contain
			 * pkt, bytes and wrong_if values
			 */
			seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
		}
		seq_putc(seq, '\n');
	}
	return 0;
}
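
/*
 * Example of the resulting /proc/net/ip_mr_cache layout (values are
 * illustrative only):
 *
 * Group    Origin   Iif     Pkts    Bytes    Wrong Oifs
 * E00000FB C0A80101 0             12     1816        0  1:1    2:1
 */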

static const struct seq_operations ipmr_mfc_seq_ops = {
	.start = ipmr_mfc_seq_start,
	.next  = ipmr_mfc_seq_next,
	.stop  = ipmr_mfc_seq_stop,
	.show  = ipmr_mfc_seq_show,
};

static int ipmr_mfc_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &ipmr_mfc_seq_ops,
			sizeof(struct ipmr_mfc_iter));
}

static const struct file_operations ipmr_mfc_fops = {
	.owner   = THIS_MODULE,
	.open    = ipmr_mfc_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};
#endif

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol = {
	.handler	=	pim_rcv,
};
#endif

/*
 *	Setup for IP multicast routing
 */
static int __net_init ipmr_net_init(struct net *net)
{
	int err = 0;

	net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device),
				      GFP_KERNEL);
	if (!net->ipv4.vif_table) {
		err = -ENOMEM;
		goto fail;
	}

	/* Forwarding cache */
	net->ipv4.mfc_cache_array = kcalloc(MFC_LINES,
					    sizeof(struct mfc_cache *),
					    GFP_KERNEL);
	if (!net->ipv4.mfc_cache_array) {
		err = -ENOMEM;
		goto fail_mfc_cache;
	}

#ifdef CONFIG_IP_PIMSM
	net->ipv4.mroute_reg_vif_num = -1;
#endif
	return 0;

fail_mfc_cache:
	kfree(net->ipv4.vif_table);
fail:
	return err;
}

static void __net_exit ipmr_net_exit(struct net *net)
{
	kfree(net->ipv4.mfc_cache_array);
	kfree(net->ipv4.vif_table);
}

static struct pernet_operations ipmr_net_ops = {
	.init = ipmr_net_init,
	.exit = ipmr_net_exit,
};
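
/*
 * register_pernet_subsys() (called from ip_mr_init() below) arranges
 * for ipmr_net_init()/ipmr_net_exit() to run for every network
 * namespace as it is created and destroyed, so each namespace gets
 * its own vif table and forwarding cache.
 */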

int __init ip_mr_init(void)
{
	int err;

	mrt_cachep = kmem_cache_create("ip_mrt_cache",
				       sizeof(struct mfc_cache),
				       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
				       NULL);
	if (!mrt_cachep)
		return -ENOMEM;

	err = register_pernet_subsys(&ipmr_net_ops);
	if (err)
		goto reg_pernet_fail;

	setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
	err = register_netdevice_notifier(&ip_mr_notifier);
	if (err)
		goto reg_notif_fail;
#ifdef CONFIG_PROC_FS
	err = -ENOMEM;
	if (!proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops))
		goto proc_vif_fail;
	if (!proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops))
		goto proc_cache_fail;
#endif
	return 0;

#ifdef CONFIG_PROC_FS
proc_cache_fail:
	proc_net_remove(&init_net, "ip_mr_vif");
proc_vif_fail:
	unregister_netdevice_notifier(&ip_mr_notifier);
#endif
reg_notif_fail:
	del_timer(&ipmr_expire_timer);
	unregister_pernet_subsys(&ipmr_net_ops);
reg_pernet_fail:
	kmem_cache_destroy(mrt_cachep);
	return err;
}