Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec
[firefly-linux-kernel-4.4.55.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
/* Final L2 transmit step for IPv6 output: handle multicast loopback and
 * scope restrictions, then resolve the neighbour entry for the route's
 * next hop and hand the skb to the neighbour output path.
 *
 * Returns the neighbour output result, 0 when the packet was consumed
 * here (looped back only, or discarded), or -EINVAL when no neighbour
 * entry could be created.
 */
static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back locally when multicast loopback is
		 * enabled on the socket and either an mroute6 socket is
		 * active (and the packet was not already forwarded) or
		 * this device is a member of the destination group.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* hop_limit 0: the looped-back clone above is the
			 * only delivery allowed; drop the on-wire copy.
			 */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);

		/* Node-local scope multicast must never leave the node. */
		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* Neighbour lookup runs under rcu_read_lock_bh(); the noref
	 * lookup result is only valid inside this critical section.
	 */
	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
124
125 static int ip6_finish_output(struct sock *sk, struct sk_buff *skb)
126 {
127         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
128             dst_allfrag(skb_dst(skb)) ||
129             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
130                 return ip6_fragment(sk, skb, ip6_finish_output2);
131         else
132                 return ip6_finish_output2(sk, skb);
133 }
134
135 int ip6_output(struct sock *sk, struct sk_buff *skb)
136 {
137         struct net_device *dev = skb_dst(skb)->dev;
138         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
139         if (unlikely(idev->cnf.disable_ipv6)) {
140                 IP6_INC_STATS(dev_net(dev), idev,
141                               IPSTATS_MIB_OUTDISCARDS);
142                 kfree_skb(skb);
143                 return 0;
144         }
145
146         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
147                             NULL, dev,
148                             ip6_finish_output,
149                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
150 }
151
152 /*
153  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
154  */
155
156 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
157              struct ipv6_txoptions *opt, int tclass)
158 {
159         struct net *net = sock_net(sk);
160         struct ipv6_pinfo *np = inet6_sk(sk);
161         struct in6_addr *first_hop = &fl6->daddr;
162         struct dst_entry *dst = skb_dst(skb);
163         struct ipv6hdr *hdr;
164         u8  proto = fl6->flowi6_proto;
165         int seg_len = skb->len;
166         int hlimit = -1;
167         u32 mtu;
168
169         if (opt) {
170                 unsigned int head_room;
171
172                 /* First: exthdrs may take lots of space (~8K for now)
173                    MAX_HEADER is not enough.
174                  */
175                 head_room = opt->opt_nflen + opt->opt_flen;
176                 seg_len += head_room;
177                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
178
179                 if (skb_headroom(skb) < head_room) {
180                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
181                         if (!skb2) {
182                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
183                                               IPSTATS_MIB_OUTDISCARDS);
184                                 kfree_skb(skb);
185                                 return -ENOBUFS;
186                         }
187                         consume_skb(skb);
188                         skb = skb2;
189                         skb_set_owner_w(skb, sk);
190                 }
191                 if (opt->opt_flen)
192                         ipv6_push_frag_opts(skb, opt, &proto);
193                 if (opt->opt_nflen)
194                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
195         }
196
197         skb_push(skb, sizeof(struct ipv6hdr));
198         skb_reset_network_header(skb);
199         hdr = ipv6_hdr(skb);
200
201         /*
202          *      Fill in the IPv6 header
203          */
204         if (np)
205                 hlimit = np->hop_limit;
206         if (hlimit < 0)
207                 hlimit = ip6_dst_hoplimit(dst);
208
209         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
210                                                      np->autoflowlabel, fl6));
211
212         hdr->payload_len = htons(seg_len);
213         hdr->nexthdr = proto;
214         hdr->hop_limit = hlimit;
215
216         hdr->saddr = fl6->saddr;
217         hdr->daddr = *first_hop;
218
219         skb->protocol = htons(ETH_P_IPV6);
220         skb->priority = sk->sk_priority;
221         skb->mark = sk->sk_mark;
222
223         mtu = dst_mtu(dst);
224         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
225                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
226                               IPSTATS_MIB_OUT, skb->len);
227                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
228                                NULL, dst->dev, dst_output_sk);
229         }
230
231         skb->dev = dst->dev;
232         ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
233         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
234         kfree_skb(skb);
235         return -EMSGSIZE;
236 }
237 EXPORT_SYMBOL(ip6_xmit);
238
/* Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain with a matching RA selector @sel (and, when the socket
 * is device-bound, a matching ifindex). All but the last match receive
 * a clone; the last match consumes the original skb.
 *
 * Returns 1 when at least one socket took the packet (skb consumed),
 * 0 otherwise (caller retains ownership of the skb).
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	/* The chain is walked under the reader side of ip6_ra_lock. */
	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				/* Previous match gets a clone; allocation
				 * failure silently skips that socket.
				 */
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
267
/* Classify a packet whose destination matched a proxy-NDP entry.
 *
 * Returns 1 when it is a unicast NDISC message that must go to the
 * local input path, 0 when it may be forwarded normally, and -1 when
 * it must be discarded (link-local destinations cannot be proxied;
 * dst_link_failure() has already been signalled in that case).
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		/* Skip extension headers to find the upper-layer protocol;
		 * on a malformed chain fall through to normal handling.
		 */
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Ensure at least the ICMPv6 type octet is in linear data. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
319
/* Final step of ip6_forward() after the NF_INET_FORWARD hook: clear the
 * skb's sender-CPU hint and push it into the dst output path.
 */
static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb)
{
	skb_sender_cpu_clear(skb);
	return dst_output_sk(sk, skb);
}
325
326 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
327 {
328         unsigned int mtu;
329         struct inet6_dev *idev;
330
331         if (dst_metric_locked(dst, RTAX_MTU)) {
332                 mtu = dst_metric_raw(dst, RTAX_MTU);
333                 if (mtu)
334                         return mtu;
335         }
336
337         mtu = IPV6_MIN_MTU;
338         rcu_read_lock();
339         idev = __in6_dev_get(dst->dev);
340         if (idev)
341                 mtu = idev->cnf.mtu6;
342         rcu_read_unlock();
343
344         return mtu;
345 }
346
347 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
348 {
349         if (skb->len <= mtu)
350                 return false;
351
352         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
353         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
354                 return true;
355
356         if (skb->ignore_df)
357                 return false;
358
359         if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
360                 return false;
361
362         return true;
363 }
364
/* Forward an IPv6 packet toward its destination.
 *
 * Applies policy checks (forwarding enabled, xfrm), delivers Router
 * Alert packets to registered sockets, enforces hop limit, handles
 * proxy NDP, generates redirects when the packet exits the interface
 * it arrived on, enforces the path MTU, then decrements hop_limit and
 * queues the packet through the NF_INET_FORWARD hook.
 *
 * Consumes the skb on every path; returns 0 on success or a negative
 * errno when the packet was dropped.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* Only forward frames addressed to us at L2. */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	/* A socket-owned skb means local delivery already claimed it. */
	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
					 IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm6_route_forward() may have replaced the dst; reload it. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* The header is modified below (hop_limit--); unshare it first. */
	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
		       skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
528
/* Propagate per-packet metadata (packet type, priority, protocol, dst
 * reference, device, mark, tc index, netfilter and security state)
 * from the original skb to a freshly built fragment.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Replace any dst already on the target with a fresh reference
	 * to the source skb's dst.
	 */
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}
545
/* Split an oversized IPv6 packet into fragments and pass each one to
 * @output. A fast path re-uses an existing frag_list when the skb
 * geometry allows; otherwise a slow path allocates and copies each
 * fragment.
 *
 * Consumes @skb (or its fragments) on every path; returns 0 on success
 * or a negative errno (-ENOMEM, -EMSGSIZE, or the output callback's
 * error).
 */
int ip6_fragment(struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	/* Per-socket frag_size is only honoured for socket-owned skbs
	 * when not called recursively (dev_recursion_level() == 0).
	 */
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	/* hlen covers the unfragmentable part (base header plus any
	 * per-fragment extension headers); prevhdr points at the
	 * nexthdr byte to overwrite with NEXTHDR_FRAGMENT.
	 */
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	/* From here on, mtu is the payload budget per fragment. */
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		/* Fast path requires every piece to fit, be 8-byte
		 * aligned (except the last), be unshared, and have
		 * headroom for the fragment header.
		 */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				/* Transfer socket ownership to each
				 * fragment for wmem accounting.
				 */
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		/* Open a gap for the fragment header between the
		 * unfragmentable part and the payload, then restore
		 * the saved header in front of it.
		 */
		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		/* Output failed mid-stream: free the not-yet-sent tail. */
		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		/* Undo the socket-ownership transfer performed on the
		 * fragments already walked before taking the slow path.
		 */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0)	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	/* On an allfrag route, strip GSO capabilities from the socket. */
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	skb->dev = skb_dst(skb)->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
834
835 static inline int ip6_rt_check(const struct rt6key *rt_key,
836                                const struct in6_addr *fl_addr,
837                                const struct in6_addr *addr_cache)
838 {
839         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
840                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
841 }
842
/* Validate a socket-cached dst against the flow @fl6.
 *
 * Returns @dst if it is still usable for this flow, or NULL after releasing
 * it when it is absent, not IPv6, or stale (destination/source key or
 * outgoing interface no longer match).  Caller owns the returned reference.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* A cached entry of a different family (e.g. via IPv6-mapped IPv4)
	 * can never satisfy an IPv6 flow; drop the reference.
	 */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE            --ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
889
/* Core of the IPv6 route lookup: resolve *@dst for @fl6, filling in a
 * source address when the flow has none, and (with optimistic DAD) falling
 * back to the default-router dst when the next hop's neighbour entry is not
 * yet valid.  On success returns 0 with a held reference in *@dst; on error
 * returns a negative errno and sets *@dst to NULL.
 */
static int ip6_dst_lookup_tail(struct net *net, struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}
	}

	/* Second attempt, now that fl6->saddr may have been filled in. */
	if (!*dst)
		*dst = ip6_route_output(net, sk, fl6);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}
991
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to do the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1009
1010 /**
1011  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1012  *      @sk: socket which provides route info
1013  *      @fl6: flow to lookup
1014  *      @final_dst: final destination address for ipsec lookup
1015  *
1016  *      This function performs a route lookup on the given flow.
1017  *
1018  *      It returns a valid dst pointer on success, or a pointer encoded
1019  *      error code.
1020  */
1021 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1022                                       const struct in6_addr *final_dst)
1023 {
1024         struct dst_entry *dst = NULL;
1025         int err;
1026
1027         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1028         if (err)
1029                 return ERR_PTR(err);
1030         if (final_dst)
1031                 fl6->daddr = *final_dst;
1032         if (!fl6->flowi6_oif)
1033                 fl6->flowi6_oif = dst->dev->ifindex;
1034
1035         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1036 }
1037 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1038
1039 /**
1040  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1041  *      @sk: socket which provides the dst cache and route info
1042  *      @fl6: flow to lookup
1043  *      @final_dst: final destination address for ipsec lookup
1044  *
1045  *      This function performs a route lookup on the given flow with the
1046  *      possibility of using the cached route in the socket if it is valid.
1047  *      It will take the socket dst lock when operating on the dst cache.
1048  *      As a result, this function can only be used in process context.
1049  *
1050  *      It returns a valid dst pointer on success, or a pointer encoded
1051  *      error code.
1052  */
1053 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1054                                          const struct in6_addr *final_dst)
1055 {
1056         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1057         int err;
1058
1059         dst = ip6_sk_dst_check(sk, dst, fl6);
1060
1061         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1062         if (err)
1063                 return ERR_PTR(err);
1064         if (final_dst)
1065                 fl6->daddr = *final_dst;
1066
1067         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1068 }
1069 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1070
/* Append datagram data as a single large (UFO/GSO) skb on @queue.
 *
 * Used when the device supports UDP fragmentation offload: instead of
 * building software fragments, one skb carrying the whole datagram is
 * created (or extended) and gso_size/gso_type are set so the device or
 * the GSO layer fragments it later.  Returns 0 or a negative errno.
 */
static inline int ip6_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			const struct flowi6 *fl6)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (!skb)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->protocol = htons(ETH_P_IPV6);
		skb->csum = 0;

		__skb_queue_tail(queue, skb);
	} else if (skb_is_gso(skb)) {
		/* Tail skb is already GSO: just append more payload. */
		goto append;
	}

	skb->ip_summed = CHECKSUM_PARTIAL;
	/* Specify the length of each IPv6 datagram fragment.
	 * It has to be a multiple of 8.
	 */
	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
				     sizeof(struct frag_hdr)) & ~7;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	/* One fragment identification covers the whole offloaded datagram. */
	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
							 &fl6->daddr,
							 &fl6->saddr);

append:
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
1130
1131 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1132                                                gfp_t gfp)
1133 {
1134         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1135 }
1136
1137 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1138                                                 gfp_t gfp)
1139 {
1140         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1141 }
1142
1143 static void ip6_append_data_mtu(unsigned int *mtu,
1144                                 int *maxfraglen,
1145                                 unsigned int fragheaderlen,
1146                                 struct sk_buff *skb,
1147                                 struct rt6_info *rt,
1148                                 unsigned int orig_mtu)
1149 {
1150         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1151                 if (!skb) {
1152                         /* first fragment, reserve header_len */
1153                         *mtu = orig_mtu - rt->dst.header_len;
1154
1155                 } else {
1156                         /*
1157                          * this fragment is not first, the headers
1158                          * space is regarded as data space.
1159                          */
1160                         *mtu = orig_mtu;
1161                 }
1162                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1163                               + fragheaderlen - sizeof(struct frag_hdr);
1164         }
1165 }
1166
/* Initialize cork state for a corked send: duplicate @opt into @v6_cork,
 * take a reference on @rt, stash the flow, and compute the fragment size
 * from the path MTU (capped by the socket's frag_size, if smaller).
 * Returns 0 or a negative errno.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork,
			  int hlimit, int tclass, struct ipv6_txoptions *opt,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = opt->tot_len;
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		/* NOTE(review): if any of the dups below fails we return
		 * -ENOBUFS with v6_cork->opt partially populated; cleanup
		 * relies on a later ip6_cork_release() — confirm callers
		 * guarantee that on this error path.
		 */
		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = hlimit;
	v6_cork->tclass = tclass;
	/* Pick the MTU: device MTU when probing PMTU, otherwise the dst
	 * (or, outside XFRM tunnels, the underlying path) MTU.
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}
1234
/* Core of ip6_append_data(): append @length bytes (pulled via @getfrag)
 * to the pending skbs on @queue, allocating new skbs sized to the path
 * MTU as needed, or handing off to UFO when the device supports it.
 * Header budgets: @transhdrlen (transport header, first skb only),
 * exthdrlen/dst_exthdrlen (option and IPsec header space, first skb only).
 * Returns 0 or a negative errno; on error the bytes not appended are
 * subtracted back out of cork->length.
 */
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, int dontfrag)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;

	/* Extension/IPsec header space is only reserved in the first skb
	 * of the cork; later appends see a non-empty queue.
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	/* Largest fragment payload end, 8-octet aligned, minus room for
	 * the fragment header.
	 */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		unsigned int maxnonfragsize, headersize;

		headersize = sizeof(struct ipv6hdr) +
			     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
			     (dst_allfrag(&rt->dst) ?
			      sizeof(struct frag_hdr) : 0) +
			     rt->rt6i_nfheader_len;

		if (ip6_sk_ignore_df(sk))
			maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
		else
			maxnonfragsize = mtu;

		/* dontfrag active */
		if ((cork->length + length > mtu - headersize) && dontfrag &&
		    (sk->sk_protocol == IPPROTO_UDP ||
		     sk->sk_protocol == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
						   sizeof(struct ipv6hdr));
			goto emsgsize;
		}

		if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
			ipv6_local_error(sk, EMSGSIZE, fl6,
					 mtu - headersize +
					 sizeof(struct ipv6hdr));
			return -EMSGSIZE;
		}
	}

	/* Datagram sockets may request TX timestamping; the tskey ties
	 * the completion back to this send.
	 */
	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/* If this is the first and only packet and device
	 * supports checksum offloading, let's use it.
	 * Use transhdrlen, same as IPv4, because partial
	 * sums only work when transhdrlen is set.
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    length + fragheaderlen < mtu &&
	    rt->dst.dev->features & NETIF_F_V6_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;
	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	/* Offload path: over-MTU UDP on a UFO-capable device goes out as
	 * one large GSO skb instead of software fragments.
	 */
	if (((length > mtu) ||
	     (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) &&
	    (sk->sk_type == SOCK_DGRAM)) {
		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
					  hh_len, fragheaderlen,
					  transhdrlen, mtu, flags, fl6);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				/* First skb: may block per MSG_DONTWAIT. */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				/* Later skbs: only if within 2x sndbuf. */
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the 8-octet-alignment overhang from
				 * the previous skb into this one, fixing up
				 * both checksums.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* No scatter-gather: copy into the linear area. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter-gather: copy into (possibly coalesced)
			 * page fragments.
			 */
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	/* Undo the optimistic cork->length bump for the unappended bytes. */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1549
1550 int ip6_append_data(struct sock *sk,
1551                     int getfrag(void *from, char *to, int offset, int len,
1552                                 int odd, struct sk_buff *skb),
1553                     void *from, int length, int transhdrlen, int hlimit,
1554                     int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1555                     struct rt6_info *rt, unsigned int flags, int dontfrag)
1556 {
1557         struct inet_sock *inet = inet_sk(sk);
1558         struct ipv6_pinfo *np = inet6_sk(sk);
1559         int exthdrlen;
1560         int err;
1561
1562         if (flags&MSG_PROBE)
1563                 return 0;
1564         if (skb_queue_empty(&sk->sk_write_queue)) {
1565                 /*
1566                  * setup for corking
1567                  */
1568                 err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
1569                                      tclass, opt, rt, fl6);
1570                 if (err)
1571                         return err;
1572
1573                 exthdrlen = (opt ? opt->opt_flen : 0);
1574                 length += exthdrlen;
1575                 transhdrlen += exthdrlen;
1576         } else {
1577                 fl6 = &inet->cork.fl.u.ip6;
1578                 transhdrlen = 0;
1579         }
1580
1581         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1582                                  &np->cork, sk_page_frag(sk), getfrag,
1583                                  from, length, transhdrlen, flags, dontfrag);
1584 }
1585 EXPORT_SYMBOL_GPL(ip6_append_data);
1586
1587 static void ip6_cork_release(struct inet_cork_full *cork,
1588                              struct inet6_cork *v6_cork)
1589 {
1590         if (v6_cork->opt) {
1591                 kfree(v6_cork->opt->dst0opt);
1592                 kfree(v6_cork->opt->dst1opt);
1593                 kfree(v6_cork->opt->hopopt);
1594                 kfree(v6_cork->opt->srcrt);
1595                 kfree(v6_cork->opt);
1596                 v6_cork->opt = NULL;
1597         }
1598
1599         if (cork->base.dst) {
1600                 dst_release(cork->base.dst);
1601                 cork->base.dst = NULL;
1602                 cork->base.flags &= ~IPCORK_ALLFRAG;
1603         }
1604         memset(&cork->fl, 0, sizeof(cork->fl));
1605 }
1606
/*
 * __ip6_make_skb - collapse all queued fragments into one skb and prepend
 * the IPv6 header (plus any corked extension headers).
 *
 * Dequeues every skb from @queue, chains the tail skbs onto the head skb's
 * frag_list, pushes extension headers and the ipv6hdr built from the cork
 * state, and releases the cork.  Returns the finished skb, or NULL if the
 * queue was empty (cork state is left untouched in that case).
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining skbs onto the head skb's frag_list, folding
	 * their length/truesize into the head and detaching them from the
	 * socket so only the head skb carries the destructor accounting.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	/* final_dst may be rewritten by a routing header in
	 * ipv6_push_nfrag_opts() below, so keep a private copy.
	 */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					np->autoflowlabel, fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Per-message-type ICMPv6 output accounting. */
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1680
1681 int ip6_send_skb(struct sk_buff *skb)
1682 {
1683         struct net *net = sock_net(skb->sk);
1684         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1685         int err;
1686
1687         err = ip6_local_out(skb);
1688         if (err) {
1689                 if (err > 0)
1690                         err = net_xmit_errno(err);
1691                 if (err)
1692                         IP6_INC_STATS(net, rt->rt6i_idev,
1693                                       IPSTATS_MIB_OUTDISCARDS);
1694         }
1695
1696         return err;
1697 }
1698
/*
 * ip6_push_pending_frames - finalize the corked write queue and transmit it.
 * A NULL result from ip6_finish_skb() (empty queue) is not an error.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1710
1711 static void __ip6_flush_pending_frames(struct sock *sk,
1712                                        struct sk_buff_head *queue,
1713                                        struct inet_cork_full *cork,
1714                                        struct inet6_cork *v6_cork)
1715 {
1716         struct sk_buff *skb;
1717
1718         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1719                 if (skb_dst(skb))
1720                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1721                                       IPSTATS_MIB_OUTDISCARDS);
1722                 kfree_skb(skb);
1723         }
1724
1725         ip6_cork_release(cork, v6_cork);
1726 }
1727
1728 void ip6_flush_pending_frames(struct sock *sk)
1729 {
1730         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1731                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1732 }
1733 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1734
/*
 * ip6_make_skb - build a complete IPv6 skb from user data in one shot,
 * using a private (stack-local) cork and queue instead of the socket's
 * write queue.  Used for the un-corked fast path (e.g. single-datagram
 * sends).  Returns the finished skb, NULL for MSG_PROBE, or ERR_PTR(err).
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     int hlimit, int tclass,
			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     int dontfrag)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	/* Extension-header space travels with the first (only) fragment. */
	int exthdrlen = (opt ? opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	/* NOTE(review): cork.base.dst is left uninitialized here; this
	 * relies on ip6_setup_cork() assigning it before anyone reads it —
	 * verify, since ip6_cork_release() tests cork->base.dst.
	 */
	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
	if (err)
		/* NOTE(review): no ip6_cork_release() on this path — if
		 * ip6_setup_cork() can fail after taking a dst reference,
		 * that reference would leak; confirm against its source.
		 */
		return ERR_PTR(err);

	/* Negative dontfrag means "use the socket's setting". */
	if (dontfrag < 0)
		dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, dontfrag);
	if (err) {
		/* Drops the partially built queue and releases the cork. */
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	/* Collapses the queue into one skb and releases the cork. */
	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}