Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
[firefly-linux-kernel-4.4.55.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
33
34 #include <net/sock.h>
35 #include <net/ip.h>
36 #include <net/icmp.h>
37 #include <net/protocol.h>
38 #include <net/ipip.h>
39 #include <net/arp.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
43 #include <net/xfrm.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
47 #include <net/gre.h>
48
49 #if IS_ENABLED(CONFIG_IPV6)
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #endif
54
55 /*
56    Problems & solutions
57    --------------------
58
59    1. The most important issue is detecting local dead loops.
60    They would cause complete host lockup in transmit, which
61    would be "resolved" by stack overflow or, if queueing is enabled,
62    with infinite looping in net_bh.
63
64    We cannot track such dead loops during route installation,
65    it is infeasible task. The most general solutions would be
66    to keep skb->encapsulation counter (sort of local ttl),
67    and silently drop packet when it expires. It is a good
68    solution, but it supposes maintaining new variable in ALL
69    skb, even if no tunneling is used.
70
71    Current solution: xmit_recursion breaks dead loops. This is a percpu
72    counter, since when we enter the first ndo_xmit(), cpu migration is
73    forbidden. We force an exit if this counter reaches RECURSION_LIMIT
74
75    2. Networking dead loops would not kill routers, but would really
76    kill network. IP hop limit plays role of "t->recursion" in this case,
77    if we copy it from packet being encapsulated to upper header.
78    It is very good solution, but it introduces two problems:
79
80    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
81      do not work over tunnels.
82    - traceroute does not work. I planned to relay ICMP from tunnel,
83      so that this problem would be solved and traceroute output
84      would be even more informative. This idea appeared to be wrong:
85      only Linux complies to rfc1812 now (yes, guys, Linux is the only
86      true router now :-)), all routers (at least, in neighbourhood of mine)
87      return only 8 bytes of payload. It is the end.
88
89    Hence, if we want that OSPF worked or traceroute said something reasonable,
90    we should search for another solution.
91
92    One of them is to parse packet trying to detect inner encapsulation
93    made by our node. It is difficult or even impossible, especially,
94    taking into account fragmentation. To be short, ttl is not a solution at all.
95
96    Current solution: The solution was UNEXPECTEDLY SIMPLE.
97    We force DF flag on tunnels with preconfigured hop limit,
98    that is ALL. :-) Well, it does not remove the problem completely,
99    but exponential growth of network traffic is changed to linear
100    (branches, that exceed pmtu are pruned) and tunnel mtu
101    rapidly degrades to value <68, where looping stops.
102    Yes, it is not good if there exists a router in the loop,
103    which does not force DF, even when encapsulating packets have DF set.
104    But it is not our problem! Nobody could accuse us, we made
105    all that we could make. Even if it is your gated who injected
106    fatal route to network, even if it were you who configured
107    fatal static route: you are innocent. :-)
108
109
110
111    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
112    practically identical code. It would be good to glue them
113    together, but it is not very evident, how to make them modular.
114    sit is integral part of IPv6, ipip and gre are naturally modular.
115    We could extract common parts (hash table, ioctl etc)
116    to a separate module (ip_tunnel.c).
117
118    Alexey Kuznetsov.
119  */
120
/* Forward declarations for ops and helpers defined later in this file. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);
125
/* Fallback tunnel: no source, no destination, no key, no options */

/* Buckets per hash table; HASH() below yields a 4-bit index, so 16. */
#define HASH_SIZE  16

/* Per-network-namespace slot id, used with net_generic(). */
static int ipgre_net_id __read_mostly;
struct ipgre_net {
	/* Four tables indexed by which of (remote, local) is configured;
	 * see the table layout comment below. */
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	/* Catch-all device for keyless packets matching no tunnel. */
	struct net_device *fb_tunnel_dev;
};
136
137 /* Tunnel hash table */
138
139 /*
140    4 hash tables:
141
142    3: (remote,local)
143    2: (remote,*)
144    1: (*,local)
145    0: (*,*)
146
147    We require exact key match i.e. if a key is present in packet
148    it will match only tunnel with the same key; if it is not present,
149    it will match only keyless tunnel.
150
151    All keyless packets, if not matching a configured keyless tunnel,
152    will match the fallback tunnel.
153  */
154
/*
 * Hash an IPv4 address (or GRE key) down to a 4-bit bucket index by
 * folding the low byte's two nibbles together.
 *
 * The argument is fully parenthesized before the cast and shift so that
 * expression arguments (e.g. HASH(a + b)) hash the whole expression;
 * the previous form bound the cast and the shift to only part of such
 * an expression.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & 0xF)

/* Table aliases: index encodes which of (remote, local) is configured. */
#define tunnels_r_l     tunnels[3]	/* (remote, local) */
#define tunnels_r       tunnels[2]	/* (remote, *)     */
#define tunnels_l       tunnels[1]	/* (*, local)      */
#define tunnels_wc      tunnels[0]	/* (*, *)          */
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

/* Walk one RCU-protected chain; expects a variable 't' in scope. */
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167
/* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
	unsigned long   rx_packets;	/* packets decapsulated on receive */
	unsigned long   rx_bytes;	/* bytes decapsulated on receive */
	unsigned long   tx_packets;	/* packets encapsulated on transmit */
	unsigned long   tx_bytes;	/* bytes encapsulated on transmit */
} __attribute__((aligned(4*sizeof(unsigned long))));
175
176 static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
177 {
178         struct pcpu_tstats sum = { 0 };
179         int i;
180
181         for_each_possible_cpu(i) {
182                 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
183
184                 sum.rx_packets += tstats->rx_packets;
185                 sum.rx_bytes   += tstats->rx_bytes;
186                 sum.tx_packets += tstats->tx_packets;
187                 sum.tx_bytes   += tstats->tx_bytes;
188         }
189         dev->stats.rx_packets = sum.rx_packets;
190         dev->stats.rx_bytes   = sum.rx_bytes;
191         dev->stats.tx_packets = sum.tx_packets;
192         dev->stats.tx_bytes   = sum.tx_bytes;
193         return &dev->stats;
194 }
195
/* Given src, dst and key, find appropriate for input tunnel. */

/*
 * Scan the four hash tables from most specific, (remote,local), down to
 * the wildcard table (*,*).  A key carried in the packet must equal the
 * tunnel's i_key exactly (keyless packets only match keyless tunnels).
 * A tunnel that also matches the ingress link and device type scores 0
 * and is returned immediately; otherwise the lowest-scoring candidate
 * seen anywhere is kept.  With no candidate at all, the fallback device
 * catches the packet if it is up.
 *
 * Runs under rcu_read_lock(); 't' is the iteration variable implicitly
 * used by for_each_ip_tunnel_rcu().
 */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
					      __be32 remote, __be32 local,
					      __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	/* ETH_P_TEB carries an ethernet frame, so only an ARPHRD_ETHER
	 * (gretap) device is a full type match; otherwise plain IPGRE. */
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	int score, cand_score = 4;

	/* Table 3: both remote and local addresses must match exactly. */
	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		/* Link and type mismatches only weaken the candidate;
		 * a perfect match (score 0) wins outright. */
		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 2: remote matches, any local. */
	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 1: local matches, any remote.  A multicast destination
	 * also matches a tunnel that configured that group as its remote. */
	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 0: wildcard tunnels, matched by key alone. */
	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	/* Nothing configured matched: hand the packet to the fallback
	 * device if it is administratively up. */
	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}
319
320 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
321                 struct ip_tunnel_parm *parms)
322 {
323         __be32 remote = parms->iph.daddr;
324         __be32 local = parms->iph.saddr;
325         __be32 key = parms->i_key;
326         unsigned int h = HASH(key);
327         int prio = 0;
328
329         if (local)
330                 prio |= 1;
331         if (remote && !ipv4_is_multicast(remote)) {
332                 prio |= 2;
333                 h ^= HASH(remote);
334         }
335
336         return &ign->tunnels[prio][h];
337 }
338
/* Convenience wrapper: bucket for an already-configured tunnel. */
static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}
344
/*
 * Insert tunnel at the head of its hash chain.
 *
 * The tunnel's next pointer is published before the tunnel itself so
 * concurrent RCU readers always see a consistent chain.  Runs under
 * RTNL (rtnl_dereference).
 */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}
352
/*
 * Remove tunnel from its hash chain, if present.
 *
 * The predecessor's next pointer is switched with rcu_assign_pointer()
 * so readers traversing under RCU never see a half-updated chain.
 * Runs under RTNL (rtnl_dereference).
 */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}
367
368 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
369                                            struct ip_tunnel_parm *parms,
370                                            int type)
371 {
372         __be32 remote = parms->iph.daddr;
373         __be32 local = parms->iph.saddr;
374         __be32 key = parms->i_key;
375         int link = parms->link;
376         struct ip_tunnel *t;
377         struct ip_tunnel __rcu **tp;
378         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
379
380         for (tp = __ipgre_bucket(ign, parms);
381              (t = rtnl_dereference(*tp)) != NULL;
382              tp = &t->next)
383                 if (local == t->parms.iph.saddr &&
384                     remote == t->parms.iph.daddr &&
385                     key == t->parms.i_key &&
386                     link == t->parms.link &&
387                     type == t->dev->type)
388                         break;
389
390         return t;
391 }
392
/*
 * Find a tunnel matching @parms or, when @create is set, allocate,
 * register and link a new one.  Returns NULL when not found and !create,
 * or on allocation/registration failure.  Runs under RTNL.
 */
static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	/* Use the configured name, or let the core number a gre<N>. */
	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	/* Resolve the underlying device and derive our MTU from it. */
	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	/* Self-reference dropped in ipgre_tunnel_uninit(). */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}
437
/*
 * ndo_uninit: detach the tunnel from its hash table and drop the
 * reference taken when the tunnel was created/linked.
 */
static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}
446
447
/*
 * ICMP error handler: invoked when an ICMP error arrives for a GRE
 * packet we transmitted.  Parses the GRE header echoed in the ICMP
 * payload, finds the originating tunnel and records the error; the
 * transmit path later replays it via err_count/err_time.
 */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	/* skb->data points at the echoed header of the packet we sent. */
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	__be16	      *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;	/* IP header + base GRE word */
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		/* Account for the optional words so the key can be read at
		 * the end of the header below. */
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	rcu_read_lock();
	/* Look up by the addresses of the packet we sent: its daddr is the
	 * tunnel's remote endpoint, its saddr our local one.  When a key
	 * is present it is the last 32-bit word of the GRE header. */
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	/* With an inherited TTL, TIME_EXCEEDED is expected; ignore it. */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	/* Error bookkeeping consumed by the transmit path: restart the
	 * count once IPTUNNEL_ERR_TIMEO has passed since the last error. */
	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
}
535
536 static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
537 {
538         if (INET_ECN_is_ce(iph->tos)) {
539                 if (skb->protocol == htons(ETH_P_IP)) {
540                         IP_ECN_set_ce(ip_hdr(skb));
541                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
542                         IP6_ECN_set_ce(ipv6_hdr(skb));
543                 }
544         }
545 }
546
547 static inline u8
548 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
549 {
550         u8 inner = 0;
551         if (skb->protocol == htons(ETH_P_IP))
552                 inner = old_iph->tos;
553         else if (skb->protocol == htons(ETH_P_IPV6))
554                 inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
555         return INET_ECN_encapsulate(tos, inner);
556 }
557
/*
 * GRE receive handler: parse the GRE header, demultiplex to a tunnel
 * device, enforce checksum/sequence policy, and re-inject the inner
 * packet into the stack.  Unmatched packets trigger an ICMP
 * port-unreachable back to the sender.  Always consumes the skb.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* base GRE header: flags + protocol */
	__be16 gre_proto;

	/* 16 bytes covers the base header plus the optional checksum,
	 * key and sequence words parsed below. */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				/* Hardware already summed the packet; a zero
				 * fold means the GRE checksum verified. */
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* Not an IPv4 header yet: WCCPv2 redirect word. */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		/* Strip outer IP + GRE headers; the checksum is adjusted
		 * for the pulled bytes. */
		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* Checksum policy must agree: a bad received checksum, or a
		 * missing one on a tunnel that requires it, is an rx error. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		/* Enforce ordering when the tunnel expects sequence numbers:
		 * drop packets without one, or with a stale one. */
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			/* pskb_may_pull() may have reallocated; refetch. */
			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		/* Propagate congestion marking from the outer header. */
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	/* No matching tunnel: report unreachable back to the sender. */
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return 0;
}
694
695 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
696 {
697         struct ip_tunnel *tunnel = netdev_priv(dev);
698         struct pcpu_tstats *tstats;
699         const struct iphdr  *old_iph = ip_hdr(skb);
700         const struct iphdr  *tiph;
701         struct flowi4 fl4;
702         u8     tos;
703         __be16 df;
704         struct rtable *rt;                      /* Route to the other host */
705         struct net_device *tdev;                /* Device to other host */
706         struct iphdr  *iph;                     /* Our new IP header */
707         unsigned int max_headroom;              /* The extra header space needed */
708         int    gre_hlen;
709         __be32 dst;
710         int    mtu;
711
712         if (dev->type == ARPHRD_ETHER)
713                 IPCB(skb)->flags = 0;
714
715         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
716                 gre_hlen = 0;
717                 tiph = (const struct iphdr *)skb->data;
718         } else {
719                 gre_hlen = tunnel->hlen;
720                 tiph = &tunnel->parms.iph;
721         }
722
723         if ((dst = tiph->daddr) == 0) {
724                 /* NBMA tunnel */
725
726                 if (skb_dst(skb) == NULL) {
727                         dev->stats.tx_fifo_errors++;
728                         goto tx_error;
729                 }
730
731                 if (skb->protocol == htons(ETH_P_IP)) {
732                         rt = skb_rtable(skb);
733                         dst = rt->rt_gateway;
734                 }
735 #if IS_ENABLED(CONFIG_IPV6)
736                 else if (skb->protocol == htons(ETH_P_IPV6)) {
737                         const struct in6_addr *addr6;
738                         struct neighbour *neigh;
739                         bool do_tx_error_icmp;
740                         int addr_type;
741
742                         neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
743                         if (neigh == NULL)
744                                 goto tx_error;
745
746                         addr6 = (const struct in6_addr *)&neigh->primary_key;
747                         addr_type = ipv6_addr_type(addr6);
748
749                         if (addr_type == IPV6_ADDR_ANY) {
750                                 addr6 = &ipv6_hdr(skb)->daddr;
751                                 addr_type = ipv6_addr_type(addr6);
752                         }
753
754                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
755                                 do_tx_error_icmp = true;
756                         else {
757                                 do_tx_error_icmp = false;
758                                 dst = addr6->s6_addr32[3];
759                         }
760                         neigh_release(neigh);
761                         if (do_tx_error_icmp)
762                                 goto tx_error_icmp;
763                 }
764 #endif
765                 else
766                         goto tx_error;
767         }
768
769         tos = tiph->tos;
770         if (tos == 1) {
771                 tos = 0;
772                 if (skb->protocol == htons(ETH_P_IP))
773                         tos = old_iph->tos;
774                 else if (skb->protocol == htons(ETH_P_IPV6))
775                         tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
776         }
777
778         rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
779                                  tunnel->parms.o_key, RT_TOS(tos),
780                                  tunnel->parms.link);
781         if (IS_ERR(rt)) {
782                 dev->stats.tx_carrier_errors++;
783                 goto tx_error;
784         }
785         tdev = rt->dst.dev;
786
787         if (tdev == dev) {
788                 ip_rt_put(rt);
789                 dev->stats.collisions++;
790                 goto tx_error;
791         }
792
793         df = tiph->frag_off;
794         if (df)
795                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
796         else
797                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
798
799         if (skb_dst(skb))
800                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
801
802         if (skb->protocol == htons(ETH_P_IP)) {
803                 df |= (old_iph->frag_off&htons(IP_DF));
804
805                 if ((old_iph->frag_off&htons(IP_DF)) &&
806                     mtu < ntohs(old_iph->tot_len)) {
807                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
808                         ip_rt_put(rt);
809                         goto tx_error;
810                 }
811         }
812 #if IS_ENABLED(CONFIG_IPV6)
813         else if (skb->protocol == htons(ETH_P_IPV6)) {
814                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
815
816                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
817                         if ((tunnel->parms.iph.daddr &&
818                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
819                             rt6->rt6i_dst.plen == 128) {
820                                 rt6->rt6i_flags |= RTF_MODIFIED;
821                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
822                         }
823                 }
824
825                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
826                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
827                         ip_rt_put(rt);
828                         goto tx_error;
829                 }
830         }
831 #endif
832
833         if (tunnel->err_count > 0) {
834                 if (time_before(jiffies,
835                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
836                         tunnel->err_count--;
837
838                         dst_link_failure(skb);
839                 } else
840                         tunnel->err_count = 0;
841         }
842
843         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
844
845         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
846             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
847                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
848                 if (max_headroom > dev->needed_headroom)
849                         dev->needed_headroom = max_headroom;
850                 if (!new_skb) {
851                         ip_rt_put(rt);
852                         dev->stats.tx_dropped++;
853                         dev_kfree_skb(skb);
854                         return NETDEV_TX_OK;
855                 }
856                 if (skb->sk)
857                         skb_set_owner_w(new_skb, skb->sk);
858                 dev_kfree_skb(skb);
859                 skb = new_skb;
860                 old_iph = ip_hdr(skb);
861         }
862
863         skb_reset_transport_header(skb);
864         skb_push(skb, gre_hlen);
865         skb_reset_network_header(skb);
866         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
867         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
868                               IPSKB_REROUTED);
869         skb_dst_drop(skb);
870         skb_dst_set(skb, &rt->dst);
871
872         /*
873          *      Push down and install the IPIP header.
874          */
875
876         iph                     =       ip_hdr(skb);
877         iph->version            =       4;
878         iph->ihl                =       sizeof(struct iphdr) >> 2;
879         iph->frag_off           =       df;
880         iph->protocol           =       IPPROTO_GRE;
881         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
882         iph->daddr              =       fl4.daddr;
883         iph->saddr              =       fl4.saddr;
884
885         if ((iph->ttl = tiph->ttl) == 0) {
886                 if (skb->protocol == htons(ETH_P_IP))
887                         iph->ttl = old_iph->ttl;
888 #if IS_ENABLED(CONFIG_IPV6)
889                 else if (skb->protocol == htons(ETH_P_IPV6))
890                         iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
891 #endif
892                 else
893                         iph->ttl = ip4_dst_hoplimit(&rt->dst);
894         }
895
896         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
897         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
898                                    htons(ETH_P_TEB) : skb->protocol;
899
900         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
901                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
902
903                 if (tunnel->parms.o_flags&GRE_SEQ) {
904                         ++tunnel->o_seqno;
905                         *ptr = htonl(tunnel->o_seqno);
906                         ptr--;
907                 }
908                 if (tunnel->parms.o_flags&GRE_KEY) {
909                         *ptr = tunnel->parms.o_key;
910                         ptr--;
911                 }
912                 if (tunnel->parms.o_flags&GRE_CSUM) {
913                         *ptr = 0;
914                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
915                 }
916         }
917
918         nf_reset(skb);
919         tstats = this_cpu_ptr(dev->tstats);
920         __IPTUNNEL_XMIT(tstats, &dev->stats);
921         return NETDEV_TX_OK;
922
923 #if IS_ENABLED(CONFIG_IPV6)
924 tx_error_icmp:
925         dst_link_failure(skb);
926 #endif
927 tx_error:
928         dev->stats.tx_errors++;
929         dev_kfree_skb(skb);
930         return NETDEV_TX_OK;
931 }
932
/*
 * Bind the tunnel to its underlying device and compute a suitable MTU.
 *
 * Performs a trial route lookup towards the configured remote endpoint to
 * guess the egress device, sizes dev->needed_headroom from that device's
 * requirements plus the GRE option words, and returns the resulting MTU
 * (clamped below at 68, the IPv4 minimum).  Also caches the full
 * encapsulation header length in tunnel->hlen.
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	/* Base GRE header: outer IP header + 4 bytes of flags/protocol. */
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		/* A unicast remote makes this a point-to-point link;
		 * gretap (ARPHRD_ETHER) devices stay Ethernet-like. */
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route (or no remote): fall back to the explicitly bound link. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	/* Never go below the IPv4 minimum MTU. */
	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
993
/*
 * ndo_do_ioctl handler implementing the classic ip_tunnel ioctl interface:
 * SIOCGETTUNNEL (query parameters), SIOCADDTUNNEL/SIOCCHGTUNNEL (create or
 * reconfigure; CAP_NET_ADMIN required) and SIOCDELTUNNEL (destroy;
 * CAP_NET_ADMIN required).  Parameters are exchanged with user space as a
 * struct ip_tunnel_parm via ifr->ifr_ifru.ifru_data.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		/* On the fallback device, look up the tunnel named by the
		 * user-supplied parameters; otherwise report this device. */
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Sanity-check the outer header template: IPv4, GRE,
		 * no IP options, only DF in frag_off, and no GRE
		 * version/routing bits (unsupported here). */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		/* A fixed TTL forces path-MTU discovery (DF set). */
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		/* Keys are meaningful only when the KEY flag is present. */
		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* The new parameters already belong to a
				 * different tunnel device. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				/* The link type (broadcast vs p-t-p) implied
				 * by the new remote must match the device. */
				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Re-key the tunnel: drop it from the hash,
				 * wait out concurrent readers, update the
				 * addressing, then re-insert. */
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				/* A new underlying link changes headroom/MTU. */
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			/* Return the effective parameters to user space. */
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			/* The fallback device itself may not be deleted. */
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1124
1125 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1126 {
1127         struct ip_tunnel *tunnel = netdev_priv(dev);
1128         if (new_mtu < 68 ||
1129             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1130                 return -EINVAL;
1131         dev->mtu = new_mtu;
1132         return 0;
1133 }
1134
1135 /* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
1137    over the Internet, provided multicast routing is tuned.
1138
1139
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
1142    I have an impression, that Cisco could make something similar,
1143    but this feature is apparently missing in IOS<=11.2(8).
1144
1145    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1146    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1147
1148    ping -t 255 224.66.66.66
1149
1150    If nobody answers, mbone does not work.
1151
1152    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1153    ip addr add 10.66.66.<somewhat>/24 dev Universe
1154    ifconfig Universe up
1155    ifconfig Universe add fe80::<Your_real_addr>/10
1156    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1157    ftp 10.66.66.66
1158    ...
1159    ftp fec0:6666:6666::193.233.7.65
1160    ...
1161
1162  */
1163
1164 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1165                         unsigned short type,
1166                         const void *daddr, const void *saddr, unsigned int len)
1167 {
1168         struct ip_tunnel *t = netdev_priv(dev);
1169         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1170         __be16 *p = (__be16*)(iph+1);
1171
1172         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1173         p[0]            = t->parms.o_flags;
1174         p[1]            = htons(type);
1175
1176         /*
1177          *      Set the source hardware address.
1178          */
1179
1180         if (saddr)
1181                 memcpy(&iph->saddr, saddr, 4);
1182         if (daddr)
1183                 memcpy(&iph->daddr, daddr, 4);
1184         if (iph->daddr)
1185                 return t->hlen;
1186
1187         return -t->hlen;
1188 }
1189
1190 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1191 {
1192         const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1193         memcpy(haddr, &iph->saddr, 4);
1194         return 4;
1195 }
1196
/* Link-layer header operations used when the tunnel behaves like a
 * (broadcast or point-to-multipoint) interface with 4-byte addresses. */
static const struct header_ops ipgre_header_ops = {
	.create = ipgre_header,
	.parse  = ipgre_header_parse,
};
1201
1202 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * ndo_open for broadcast GRE: when the remote is a multicast group, route
 * towards it to find the real egress device and join the group there.
 * The joined device's ifindex is remembered in t->mlink so ipgre_close()
 * can leave the group again.
 */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		/* From here on, "dev" is the egress device, not the tunnel. */
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1228
1229 static int ipgre_close(struct net_device *dev)
1230 {
1231         struct ip_tunnel *t = netdev_priv(dev);
1232
1233         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1234                 struct in_device *in_dev;
1235                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1236                 if (in_dev)
1237                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1238         }
1239         return 0;
1240 }
1241
1242 #endif
1243
/* Device operations for plain (layer-3) GRE tunnels. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};
1256
/* dev->destructor: release the per-cpu statistics before the netdev itself. */
static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}
1262
/*
 * netdev setup callback for "gre" devices: install the ops and the
 * defaults for a freshly allocated tunnel device.  The initial
 * headroom/MTU assume the minimal encapsulation (outer IP header plus
 * the 4-byte base GRE header); ipgre_tunnel_bind_dev() refines them.
 */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	/* Keep the cached route attached to skbs across transmits. */
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}
1277
/*
 * ndo_init for "gre" devices: publish the tunnel endpoints as the device
 * hardware/broadcast addresses, pick header_ops for the NBMA or broadcast
 * cases, and allocate per-cpu stats.  Returns 0 or a negative errno.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	/* The 4-byte "hardware" addresses are the tunnel endpoints. */
	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		/* A multicast remote needs a local address to join with. */
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		/* NBMA mode: destination supplied per-packet via header_ops. */
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
1310
/*
 * Initialize the per-namespace fallback "gre0" device: a keyless,
 * addressless tunnel that receives otherwise-unmatched GRE traffic.
 * The extra dev_hold() pins it for the lifetime of the namespace.
 */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	/* Minimal encapsulation: outer IP + 4-byte base GRE header. */
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
}
1326
1327
/* Hooks registered with the shared GRE demultiplexer (GREPROTO_CISCO). */
static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};
1332
1333 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1334 {
1335         int prio;
1336
1337         for (prio = 0; prio < 4; prio++) {
1338                 int h;
1339                 for (h = 0; h < HASH_SIZE; h++) {
1340                         struct ip_tunnel *t;
1341
1342                         t = rtnl_dereference(ign->tunnels[prio][h]);
1343
1344                         while (t != NULL) {
1345                                 unregister_netdevice_queue(t->dev, head);
1346                                 t = rtnl_dereference(t->next);
1347                         }
1348                 }
1349         }
1350 }
1351
/*
 * Per-namespace init: allocate and register the fallback "gre0" device
 * and insert it into the wildcard hash chain.  On registration failure
 * the device is torn down directly via its destructor.
 */
static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	/* Make the fallback tunnel visible to the receive path. */
	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}
1380
1381 static void __net_exit ipgre_exit_net(struct net *net)
1382 {
1383         struct ipgre_net *ign;
1384         LIST_HEAD(list);
1385
1386         ign = net_generic(net, ipgre_net_id);
1387         rtnl_lock();
1388         ipgre_destroy_tunnels(ign, &list);
1389         unregister_netdevice_many(&list);
1390         rtnl_unlock();
1391 }
1392
/* Per-network-namespace lifecycle hooks and private state size. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};
1399
1400 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1401 {
1402         __be16 flags;
1403
1404         if (!data)
1405                 return 0;
1406
1407         flags = 0;
1408         if (data[IFLA_GRE_IFLAGS])
1409                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1410         if (data[IFLA_GRE_OFLAGS])
1411                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1412         if (flags & (GRE_VERSION|GRE_ROUTING))
1413                 return -EINVAL;
1414
1415         return 0;
1416 }
1417
1418 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1419 {
1420         __be32 daddr;
1421
1422         if (tb[IFLA_ADDRESS]) {
1423                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1424                         return -EINVAL;
1425                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1426                         return -EADDRNOTAVAIL;
1427         }
1428
1429         if (!data)
1430                 goto out;
1431
1432         if (data[IFLA_GRE_REMOTE]) {
1433                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1434                 if (!daddr)
1435                         return -EINVAL;
1436         }
1437
1438 out:
1439         return ipgre_tunnel_validate(tb, data);
1440 }
1441
/*
 * Translate netlink IFLA_GRE_* attributes into a struct ip_tunnel_parm.
 * Missing attributes leave their zeroed defaults, except PMTU discovery,
 * which defaults to ON (DF set) unless explicitly disabled.
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	/* PMTU discovery is on by default: absent attribute or non-zero. */
	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1482
/*
 * ndo_init for "gretap" devices: bind to the underlying link (sets
 * headroom/hlen as a side effect) and allocate per-cpu stats.
 */
static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	/* Return value (MTU) intentionally ignored here; the caller
	 * decides the MTU via rtnetlink. */
	ipgre_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
1500
/* Device operations for Ethernet-over-GRE ("gretap") devices. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};
1510
/*
 * netdev setup callback for "gretap" devices: start from a standard
 * Ethernet device, then install the GRE tap ops and destructor.
 */
static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor 	= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}
1522
/*
 * rtnl_link_ops->newlink: create a GRE tunnel from netlink attributes.
 * Fails with -EEXIST if a tunnel with the same parameters already exists.
 * On success the device is registered and linked into the hash tables;
 * the dev_hold() balances the later unregister path.
 */
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	/* gretap without an explicit MAC gets a random one. */
	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1559
/*
 * rtnl_link_ops->changelink: reconfigure an existing tunnel from netlink
 * attributes.  Changes to the hash-key fields (addresses, input key)
 * require unlinking and relinking the tunnel; the fallback device cannot
 * be reconfigured.  Returns 0 or a negative errno.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* The new parameters already belong to another device. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		/* Non-Ethernet tunnels must keep the same link type
		 * (broadcast vs point-to-point) implied by the remote. */
		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		/* Re-key: remove from the hash, update, re-insert. */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	/* These fields do not affect hashing and can be set in place. */
	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1623
1624 static size_t ipgre_get_size(const struct net_device *dev)
1625 {
1626         return
1627                 /* IFLA_GRE_LINK */
1628                 nla_total_size(4) +
1629                 /* IFLA_GRE_IFLAGS */
1630                 nla_total_size(2) +
1631                 /* IFLA_GRE_OFLAGS */
1632                 nla_total_size(2) +
1633                 /* IFLA_GRE_IKEY */
1634                 nla_total_size(4) +
1635                 /* IFLA_GRE_OKEY */
1636                 nla_total_size(4) +
1637                 /* IFLA_GRE_LOCAL */
1638                 nla_total_size(4) +
1639                 /* IFLA_GRE_REMOTE */
1640                 nla_total_size(4) +
1641                 /* IFLA_GRE_TTL */
1642                 nla_total_size(1) +
1643                 /* IFLA_GRE_TOS */
1644                 nla_total_size(1) +
1645                 /* IFLA_GRE_PMTUDISC */
1646                 nla_total_size(1) +
1647                 0;
1648 }
1649
/*
 * rtnl_link_ops->fill_info: dump the tunnel parameters as IFLA_GRE_*
 * attributes.  The NLA_PUT_* macros jump to nla_put_failure when the
 * skb runs out of room.
 */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	/* PMTU discovery is reported as a boolean derived from DF. */
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1671
/* Netlink attribute validation policy for IFLA_GRE_* attributes. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1684
/* rtnetlink registration for plain "gre" (layer-3) tunnel devices. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1697
/*
 * rtnetlink ops for Ethernet-over-GRE devices ("gretap").  Shares the
 * attribute policy and netlink handlers with plain GRE; only setup and
 * validate differ, to configure an Ethernet-style device.
 */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1710
1711 /*
1712  *      And now the modules code and kernel interface.
1713  */
1714
1715 static int __init ipgre_init(void)
1716 {
1717         int err;
1718
1719         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1720
1721         err = register_pernet_device(&ipgre_net_ops);
1722         if (err < 0)
1723                 return err;
1724
1725         err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1726         if (err < 0) {
1727                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1728                 goto add_proto_failed;
1729         }
1730
1731         err = rtnl_link_register(&ipgre_link_ops);
1732         if (err < 0)
1733                 goto rtnl_link_failed;
1734
1735         err = rtnl_link_register(&ipgre_tap_ops);
1736         if (err < 0)
1737                 goto tap_ops_failed;
1738
1739 out:
1740         return err;
1741
1742 tap_ops_failed:
1743         rtnl_link_unregister(&ipgre_link_ops);
1744 rtnl_link_failed:
1745         gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1746 add_proto_failed:
1747         unregister_pernet_device(&ipgre_net_ops);
1748         goto out;
1749 }
1750
/*
 * Module exit: tear down in the reverse order of ipgre_init so no
 * handler can be invoked after the state it depends on is gone.
 * gre_del_protocol can fail only if something else replaced our hook;
 * that is logged but otherwise unrecoverable at exit time.
 */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");
	unregister_pernet_device(&ipgre_net_ops);
}
1759
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Allow "ip link add type gre|gretap" to autoload this module. */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");