net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
  16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/overflow-arith.h>
  32 #include <linux/string.h>
  33 #include <linux/socket.h>
  34 #include <linux/net.h>
  35 #include <linux/netdevice.h>
  36 #include <linux/if_arp.h>
  37 #include <linux/in6.h>
  38 #include <linux/tcp.h>
  39 #include <linux/route.h>
  40 #include <linux/module.h>
  41 #include <linux/slab.h>
  42
  43 #include <linux/netfilter.h>
  44 #include <linux/netfilter_ipv6.h>
  45
  46 #include <net/sock.h>
  47 #include <net/snmp.h>
  48
  49 #include <net/ipv6.h>
  50 #include <net/ndisc.h>
  51 #include <net/protocol.h>
  52 #include <net/ip6_route.h>
  53 #include <net/addrconf.h>
  54 #include <net/rawv6.h>
  55 #include <net/icmp.h>
  56 #include <net/xfrm.h>
  57 #include <net/checksum.h>
  58 #include <linux/mroute6.h>
  59 #include <net/l3mdev.h>
  60
  61 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  62 {
  63         struct dst_entry *dst = skb_dst(skb);
  64         struct net_device *dev = dst->dev;
  65         struct neighbour *neigh;
  66         struct in6_addr *nexthop;
  67         int ret;
  68
  69         skb->protocol = htons(ETH_P_IPV6);
  70         skb->dev = dev;
  71
  72         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  73                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  74
  75                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  76                     ((mroute6_socket(net, skb) &&
  77                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  78                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
  79                                          &ipv6_hdr(skb)->saddr))) {
  80                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  81
  82                         /* Do not check for IFF_ALLMULTI; multicast routing
  83                            is not supported in any case.
  84                          */
  85                         if (newskb)
  86                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  87                                         net, sk, newskb, NULL, newskb->dev,
  88                                         dev_loopback_xmit);
  89
  90                         if (ipv6_hdr(skb)->hop_limit == 0) {
  91                                 IP6_INC_STATS(net, idev,
  92                                               IPSTATS_MIB_OUTDISCARDS);
  93                                 kfree_skb(skb);
  94                                 return 0;
  95                         }
  96                 }
  97
  98                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
  99
 100                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
 101                     IPV6_ADDR_SCOPE_NODELOCAL &&
 102                     !(dev->flags & IFF_LOOPBACK)) {
 103                         kfree_skb(skb);
 104                         return 0;
 105                 }
 106         }
 107
 108         rcu_read_lock_bh();
 109         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 110         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 111         if (unlikely(!neigh))
 112                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 113         if (!IS_ERR(neigh)) {
 114                 ret = dst_neigh_output(dst, neigh, skb);
 115                 rcu_read_unlock_bh();
 116                 return ret;
 117         }
 118         rcu_read_unlock_bh();
 119
 120         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 121         kfree_skb(skb);
 122         return -EINVAL;
 123 }
 124
 125 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 126 {
 127         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 128             dst_allfrag(skb_dst(skb)) ||
 129             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 130                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 131         else
 132                 return ip6_finish_output2(net, sk, skb);
 133 }
 134
 135 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 136 {
 137         struct net_device *dev = skb_dst(skb)->dev;
 138         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 139
 140         if (unlikely(idev->cnf.disable_ipv6)) {
 141                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 142                 kfree_skb(skb);
 143                 return 0;
 144         }
 145
 146         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 147                             net, sk, skb, NULL, dev,
 148                             ip6_finish_output,
 149                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 150 }
 151
 152 /*
 153  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 154  * Note : socket lock is not held for SYNACK packets, but might be modified
 155  * by calls to skb_set_owner_w() and ipv6_local_error(),
 156  * which are using proper atomic operations or spinlocks.
 157  */
 158 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 159              struct ipv6_txoptions *opt, int tclass)
 160 {
 161         struct net *net = sock_net(sk);
 162         const struct ipv6_pinfo *np = inet6_sk(sk);
 163         struct in6_addr *first_hop = &fl6->daddr;
 164         struct dst_entry *dst = skb_dst(skb);
 165         struct ipv6hdr *hdr;
 166         u8  proto = fl6->flowi6_proto;
 167         int seg_len = skb->len;
 168         int hlimit = -1;
 169         u32 mtu;
 170
 171         if (opt) {
 172                 unsigned int head_room;
 173
 174                 /* First: exthdrs may take lots of space (~8K for now)
 175                    MAX_HEADER is not enough.
 176                  */
 177                 head_room = opt->opt_nflen + opt->opt_flen;
 178                 seg_len += head_room;
 179                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 180
 181                 if (skb_headroom(skb) < head_room) {
 182                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 183                         if (!skb2) {
 184                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 185                                               IPSTATS_MIB_OUTDISCARDS);
 186                                 kfree_skb(skb);
 187                                 return -ENOBUFS;
 188                         }
 189                         consume_skb(skb);
 190                         skb = skb2;
 191                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
 192                          * it is safe to call in our context (socket lock not held)
 193                          */
 194                         skb_set_owner_w(skb, (struct sock *)sk);
 195                 }
 196                 if (opt->opt_flen)
 197                         ipv6_push_frag_opts(skb, opt, &proto);
 198                 if (opt->opt_nflen)
 199                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 200         }
 201
 202         skb_push(skb, sizeof(struct ipv6hdr));
 203         skb_reset_network_header(skb);
 204         hdr = ipv6_hdr(skb);
 205
 206         /*
 207          *      Fill in the IPv6 header
 208          */
 209         if (np)
 210                 hlimit = np->hop_limit;
 211         if (hlimit < 0)
 212                 hlimit = ip6_dst_hoplimit(dst);
 213
 214         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 215                                                      np->autoflowlabel, fl6));
 216
 217         hdr->payload_len = htons(seg_len);
 218         hdr->nexthdr = proto;
 219         hdr->hop_limit = hlimit;
 220
 221         hdr->saddr = fl6->saddr;
 222         hdr->daddr = *first_hop;
 223
 224         skb->protocol = htons(ETH_P_IPV6);
 225         skb->priority = sk->sk_priority;
 226         skb->mark = sk->sk_mark;
 227
 228         mtu = dst_mtu(dst);
 229         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 230                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 231                               IPSTATS_MIB_OUT, skb->len);
 232                 /* hooks should never assume socket lock is held.
 233                  * we promote our socket to non const
 234                  */
 235                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 236                                net, (struct sock *)sk, skb, NULL, dst->dev,
 237                                dst_output);
 238         }
 239
 240         skb->dev = dst->dev;
 241         /* ipv6_local_error() does not require socket lock,
 242          * we promote our socket to non const
 243          */
 244         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 245
 246         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 247         kfree_skb(skb);
 248         return -EMSGSIZE;
 249 }
 250 EXPORT_SYMBOL(ip6_xmit);
 251
 252 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 253 {
 254         struct ip6_ra_chain *ra;
 255         struct sock *last = NULL;
 256
 257         read_lock(&ip6_ra_lock);
 258         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 259                 struct sock *sk = ra->sk;
 260                 if (sk && ra->sel == sel &&
 261                     (!sk->sk_bound_dev_if ||
 262                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 263                         if (last) {
 264                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 265                                 if (skb2)
 266                                         rawv6_rcv(last, skb2);
 267                         }
 268                         last = sk;
 269                 }
 270         }
 271
 272         if (last) {
 273                 rawv6_rcv(last, skb);
 274                 read_unlock(&ip6_ra_lock);
 275                 return 1;
 276         }
 277         read_unlock(&ip6_ra_lock);
 278         return 0;
 279 }
 280
 281 static int ip6_forward_proxy_check(struct sk_buff *skb)
 282 {
 283         struct ipv6hdr *hdr = ipv6_hdr(skb);
 284         u8 nexthdr = hdr->nexthdr;
 285         __be16 frag_off;
 286         int offset;
 287
 288         if (ipv6_ext_hdr(nexthdr)) {
 289                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 290                 if (offset < 0)
 291                         return 0;
 292         } else
 293                 offset = sizeof(struct ipv6hdr);
 294
 295         if (nexthdr == IPPROTO_ICMPV6) {
 296                 struct icmp6hdr *icmp6;
 297
 298                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 299                                          offset + 1 - skb->data)))
 300                         return 0;
 301
 302                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 303
 304                 switch (icmp6->icmp6_type) {
 305                 case NDISC_ROUTER_SOLICITATION:
 306                 case NDISC_ROUTER_ADVERTISEMENT:
 307                 case NDISC_NEIGHBOUR_SOLICITATION:
 308                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 309                 case NDISC_REDIRECT:
 310                         /* For reaction involving unicast neighbor discovery
 311                          * message destined to the proxied address, pass it to
 312                          * input function.
 313                          */
 314                         return 1;
 315                 default:
 316                         break;
 317                 }
 318         }
 319
 320         /*
 321          * The proxying router can't forward traffic sent to a link-local
 322          * address, so signal the sender and discard the packet. This
 323          * behavior is clarified by the MIPv6 specification.
 324          */
 325         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 326                 dst_link_failure(skb);
 327                 return -1;
 328         }
 329
 330         return 0;
 331 }
 332
 333 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 334                                      struct sk_buff *skb)
 335 {
 336         skb_sender_cpu_clear(skb);
 337         return dst_output(net, sk, skb);
 338 }
 339
 340 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 341 {
 342         unsigned int mtu;
 343         struct inet6_dev *idev;
 344
 345         if (dst_metric_locked(dst, RTAX_MTU)) {
 346                 mtu = dst_metric_raw(dst, RTAX_MTU);
 347                 if (mtu)
 348                         return mtu;
 349         }
 350
 351         mtu = IPV6_MIN_MTU;
 352         rcu_read_lock();
 353         idev = __in6_dev_get(dst->dev);
 354         if (idev)
 355                 mtu = idev->cnf.mtu6;
 356         rcu_read_unlock();
 357
 358         return mtu;
 359 }
 360
 361 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 362 {
 363         if (skb->len <= mtu)
 364                 return false;
 365
 366         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 367         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 368                 return true;
 369
 370         if (skb->ignore_df)
 371                 return false;
 372
 373         if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
 374                 return false;
 375
 376         return true;
 377 }
 378
 379 int ip6_forward(struct sk_buff *skb)
 380 {
 381         struct dst_entry *dst = skb_dst(skb);
 382         struct ipv6hdr *hdr = ipv6_hdr(skb);
 383         struct inet6_skb_parm *opt = IP6CB(skb);
 384         struct net *net = dev_net(dst->dev);
 385         u32 mtu;
 386
 387         if (net->ipv6.devconf_all->forwarding == 0)
 388                 goto error;
 389
 390         if (skb->pkt_type != PACKET_HOST)
 391                 goto drop;
 392
 393         if (unlikely(skb->sk))
 394                 goto drop;
 395
 396         if (skb_warn_if_lro(skb))
 397                 goto drop;
 398
 399         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 400                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
 401                                  IPSTATS_MIB_INDISCARDS);
 402                 goto drop;
 403         }
 404
 405         skb_forward_csum(skb);
 406
 407         /*
 408          *      We DO NOT make any processing on
 409          *      RA packets, pushing them to user level AS IS
 410          *      without ane WARRANTY that application will be able
 411          *      to interpret them. The reason is that we
 412          *      cannot make anything clever here.
 413          *
 414          *      We are not end-node, so that if packet contains
 415          *      AH/ESP, we cannot make anything.
 416          *      Defragmentation also would be mistake, RA packets
 417          *      cannot be fragmented, because there is no warranty
 418          *      that different fragments will go along one path. --ANK
 419          */
 420         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 421                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 422                         return 0;
 423         }
 424
 425         /*
 426          *      check and decrement ttl
 427          */
 428         if (hdr->hop_limit <= 1) {
 429                 /* Force OUTPUT device used as source address */
 430                 skb->dev = dst->dev;
 431                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 432                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
 433                                  IPSTATS_MIB_INHDRERRORS);
 434
 435                 kfree_skb(skb);
 436                 return -ETIMEDOUT;
 437         }
 438
 439         /* XXX: idev->cnf.proxy_ndp? */
 440         if (net->ipv6.devconf_all->proxy_ndp &&
 441             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 442                 int proxied = ip6_forward_proxy_check(skb);
 443                 if (proxied > 0)
 444                         return ip6_input(skb);
 445                 else if (proxied < 0) {
 446                         IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
 447                                          IPSTATS_MIB_INDISCARDS);
 448                         goto drop;
 449                 }
 450         }
 451
 452         if (!xfrm6_route_forward(skb)) {
 453                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
 454                                  IPSTATS_MIB_INDISCARDS);
 455                 goto drop;
 456         }
 457         dst = skb_dst(skb);
 458
 459         /* IPv6 specs say nothing about it, but it is clear that we cannot
 460            send redirects to source routed frames.
 461            We don't send redirects to frames decapsulated from IPsec.
 462          */
 463         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 464                 struct in6_addr *target = NULL;
 465                 struct inet_peer *peer;
 466                 struct rt6_info *rt;
 467
 468                 /*
 469                  *      incoming and outgoing devices are the same
 470                  *      send a redirect.
 471                  */
 472
 473                 rt = (struct rt6_info *) dst;
 474                 if (rt->rt6i_flags & RTF_GATEWAY)
 475                         target = &rt->rt6i_gateway;
 476                 else
 477                         target = &hdr->daddr;
 478
 479                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 480
 481                 /* Limit redirects both by destination (here)
 482                    and by source (inside ndisc_send_redirect)
 483                  */
 484                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 485                         ndisc_send_redirect(skb, target);
 486                 if (peer)
 487                         inet_putpeer(peer);
 488         } else {
 489                 int addrtype = ipv6_addr_type(&hdr->saddr);
 490
 491                 /* This check is security critical. */
 492                 if (addrtype == IPV6_ADDR_ANY ||
 493                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 494                         goto error;
 495                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 496                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 497                                     ICMPV6_NOT_NEIGHBOUR, 0);
 498                         goto error;
 499                 }
 500         }
 501
 502         mtu = ip6_dst_mtu_forward(dst);
 503         if (mtu < IPV6_MIN_MTU)
 504                 mtu = IPV6_MIN_MTU;
 505
 506         if (ip6_pkt_too_big(skb, mtu)) {
 507                 /* Again, force OUTPUT device used as source address */
 508                 skb->dev = dst->dev;
 509                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 510                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
 511                                  IPSTATS_MIB_INTOOBIGERRORS);
 512                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
 513                                  IPSTATS_MIB_FRAGFAILS);
 514                 kfree_skb(skb);
 515                 return -EMSGSIZE;
 516         }
 517
 518         if (skb_cow(skb, dst->dev->hard_header_len)) {
 519                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
 520                                  IPSTATS_MIB_OUTDISCARDS);
 521                 goto drop;
 522         }
 523
 524         hdr = ipv6_hdr(skb);
 525
 526         /* Mangling hops number delayed to point after skb COW */
 527
 528         hdr->hop_limit--;
 529
 530         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 531         IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 532         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 533                        net, NULL, skb, skb->dev, dst->dev,
 534                        ip6_forward_finish);
 535
 536 error:
 537         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 538 drop:
 539         kfree_skb(skb);
 540         return -EINVAL;
 541 }
 542
 543 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 544 {
 545         to->pkt_type = from->pkt_type;
 546         to->priority = from->priority;
 547         to->protocol = from->protocol;
 548         skb_dst_drop(to);
 549         skb_dst_set(to, dst_clone(skb_dst(from)));
 550         to->dev = from->dev;
 551         to->mark = from->mark;
 552
 553 #ifdef CONFIG_NET_SCHED
 554         to->tc_index = from->tc_index;
 555 #endif
 556         nf_copy(to, from);
 557         skb_copy_secmark(to, from);
 558 }
 559
 560 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 561                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 562 {
 563         struct sk_buff *frag;
 564         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 565         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 566                                 inet6_sk(skb->sk) : NULL;
 567         struct ipv6hdr *tmp_hdr;
 568         struct frag_hdr *fh;
 569         unsigned int mtu, hlen, left, len;
 570         int hroom, troom;
 571         __be32 frag_id;
 572         int ptr, offset = 0, err = 0;
 573         u8 *prevhdr, nexthdr = 0;
 574
 575         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 576         nexthdr = *prevhdr;
 577
 578         mtu = ip6_skb_dst_mtu(skb);
 579
 580         /* We must not fragment if the socket is set to force MTU discovery
 581          * or if the skb is not generated by a local socket.
 582          */
 583         if (unlikely(!skb->ignore_df && skb->len > mtu))
 584                 goto fail_toobig;
 585
 586         if (IP6CB(skb)->frag_max_size) {
 587                 if (IP6CB(skb)->frag_max_size > mtu)
 588                         goto fail_toobig;
 589
 590                 /* don't send fragments larger than what we received */
 591                 mtu = IP6CB(skb)->frag_max_size;
 592                 if (mtu < IPV6_MIN_MTU)
 593                         mtu = IPV6_MIN_MTU;
 594         }
 595
 596         if (np && np->frag_size < mtu) {
 597                 if (np->frag_size)
 598                         mtu = np->frag_size;
 599         }
 600
 601         if (overflow_usub(mtu, hlen + sizeof(struct frag_hdr), &mtu) ||
 602             mtu <= 7)
 603                 goto fail_toobig;
 604
 605         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 606                                     &ipv6_hdr(skb)->saddr);
 607
 608         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 609         if (skb_has_frag_list(skb)) {
 610                 int first_len = skb_pagelen(skb);
 611                 struct sk_buff *frag2;
 612
 613                 if (first_len - hlen > mtu ||
 614                     ((first_len - hlen) & 7) ||
 615                     skb_cloned(skb) ||
 616                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 617                         goto slow_path;
 618
 619                 skb_walk_frags(skb, frag) {
 620                         /* Correct geometry. */
 621                         if (frag->len > mtu ||
 622                             ((frag->len & 7) && frag->next) ||
 623                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 624                                 goto slow_path_clean;
 625
 626                         /* Partially cloned skb? */
 627                         if (skb_shared(frag))
 628                                 goto slow_path_clean;
 629
 630                         BUG_ON(frag->sk);
 631                         if (skb->sk) {
 632                                 frag->sk = skb->sk;
 633                                 frag->destructor = sock_wfree;
 634                         }
 635                         skb->truesize -= frag->truesize;
 636                 }
 637
 638                 err = 0;
 639                 offset = 0;
 640                 /* BUILD HEADER */
 641
 642                 *prevhdr = NEXTHDR_FRAGMENT;
 643                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 644                 if (!tmp_hdr) {
 645                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 646                                       IPSTATS_MIB_FRAGFAILS);
 647                         err = -ENOMEM;
 648                         goto fail;
 649                 }
 650                 frag = skb_shinfo(skb)->frag_list;
 651                 skb_frag_list_init(skb);
 652
 653                 __skb_pull(skb, hlen);
 654                 fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
 655                 __skb_push(skb, hlen);
 656                 skb_reset_network_header(skb);
 657                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 658
 659                 fh->nexthdr = nexthdr;
 660                 fh->reserved = 0;
 661                 fh->frag_off = htons(IP6_MF);
 662                 fh->identification = frag_id;
 663
 664                 first_len = skb_pagelen(skb);
 665                 skb->data_len = first_len - skb_headlen(skb);
 666                 skb->len = first_len;
 667                 ipv6_hdr(skb)->payload_len = htons(first_len -
 668                                                    sizeof(struct ipv6hdr));
 669
 670                 dst_hold(&rt->dst);
 671
 672                 for (;;) {
 673                         /* Prepare header of the next frame,
 674                          * before previous one went down. */
 675                         if (frag) {
 676                                 frag->ip_summed = CHECKSUM_NONE;
 677                                 skb_reset_transport_header(frag);
 678                                 fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
 679                                 __skb_push(frag, hlen);
 680                                 skb_reset_network_header(frag);
 681                                 memcpy(skb_network_header(frag), tmp_hdr,
 682                                        hlen);
 683                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 684                                 fh->nexthdr = nexthdr;
 685                                 fh->reserved = 0;
 686                                 fh->frag_off = htons(offset);
 687                                 if (frag->next)
 688                                         fh->frag_off |= htons(IP6_MF);
 689                                 fh->identification = frag_id;
 690                                 ipv6_hdr(frag)->payload_len =
 691                                                 htons(frag->len -
 692                                                       sizeof(struct ipv6hdr));
 693                                 ip6_copy_metadata(frag, skb);
 694                         }
 695
 696                         err = output(net, sk, skb);
 697                         if (!err)
 698                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 699                                               IPSTATS_MIB_FRAGCREATES);
 700
 701                         if (err || !frag)
 702                                 break;
 703
 704                         skb = frag;
 705                         frag = skb->next;
 706                         skb->next = NULL;
 707                 }
 708
 709                 kfree(tmp_hdr);
 710
 711                 if (err == 0) {
 712                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 713                                       IPSTATS_MIB_FRAGOKS);
 714                         ip6_rt_put(rt);
 715                         return 0;
 716                 }
 717
 718                 kfree_skb_list(frag);
 719
 720                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 721                               IPSTATS_MIB_FRAGFAILS);
 722                 ip6_rt_put(rt);
 723                 return err;
 724
 725 slow_path_clean:
 726                 skb_walk_frags(skb, frag2) {
 727                         if (frag2 == frag)
 728                                 break;
 729                         frag2->sk = NULL;
 730                         frag2->destructor = NULL;
 731                         skb->truesize += frag2->truesize;
 732                 }
 733         }
 734
 735 slow_path:
 736         if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
 737             skb_checksum_help(skb))
 738                 goto fail;
 739
 740         left = skb->len - hlen;         /* Space per frame */
 741         ptr = hlen;                     /* Where to start from */
 742
 743         /*
 744          *      Fragment the datagram.
 745          */
 746
 747         *prevhdr = NEXTHDR_FRAGMENT;
 748         troom = rt->dst.dev->needed_tailroom;
 749
 750         /*
 751          *      Keep copying data until we run out.
 752          */
 753         while (left > 0)        {
 754                 len = left;
 755                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 756                 if (len > mtu)
 757                         len = mtu;
 758                 /* IF: we are not sending up to and including the packet end
 759                    then align the next start on an eight byte boundary */
 760                 if (len < left) {
 761                         len &= ~7;
 762                 }
 763
 764                 /* Allocate buffer */
 765                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 766                                  hroom + troom, GFP_ATOMIC);
 767                 if (!frag) {
 768                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 769                                       IPSTATS_MIB_FRAGFAILS);
 770                         err = -ENOMEM;
 771                         goto fail;
 772                 }
 773
 774                 /*
 775                  *      Set up data on packet
 776                  */
 777
 778                 ip6_copy_metadata(frag, skb);
 779                 skb_reserve(frag, hroom);
 780                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 781                 skb_reset_network_header(frag);
 782                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 783                 frag->transport_header = (frag->network_header + hlen +
 784                                           sizeof(struct frag_hdr));
 785
 786                 /*
 787                  *      Charge the memory for the fragment to any owner
 788                  *      it might possess
 789                  */
 790                 if (skb->sk)
 791                         skb_set_owner_w(frag, skb->sk);
 792
 793                 /*
 794                  *      Copy the packet header into the new buffer.
 795                  */
 796                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 797
 798                 /*
 799                  *      Build fragment header.
 800                  */
 801                 fh->nexthdr = nexthdr;
 802                 fh->reserved = 0;
 803                 fh->identification = frag_id;
 804
 805                 /*
 806                  *      Copy a block of the IP datagram.
 807                  */
 808                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
 809                                      len));
 810                 left -= len;
 811
 812                 fh->frag_off = htons(offset);
 813                 if (left > 0)
 814                         fh->frag_off |= htons(IP6_MF);
 815                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 816                                                     sizeof(struct ipv6hdr));
 817
 818                 ptr += len;
 819                 offset += len;
 820
 821                 /*
 822                  *      Put this fragment into the sending queue.
 823                  */
 824                 err = output(net, sk, frag);
 825                 if (err)
 826                         goto fail;
 827
 828                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 829                               IPSTATS_MIB_FRAGCREATES);
 830         }
 831         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 832                       IPSTATS_MIB_FRAGOKS);
 833         consume_skb(skb);
 834         return err;
 835
 836 fail_toobig:
 837         if (skb->sk && dst_allfrag(skb_dst(skb)))
 838                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 839
 840         skb->dev = skb_dst(skb)->dev;
 841         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 842         err = -EMSGSIZE;
 843
 844 fail:
 845         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 846                       IPSTATS_MIB_FRAGFAILS);
 847         kfree_skb(skb);
 848         return err;
 849 }
 850
 851 static inline int ip6_rt_check(const struct rt6key *rt_key,
 852                                const struct in6_addr *fl_addr,
 853                                const struct in6_addr *addr_cache)
 854 {
 855         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 856                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 857 }
 858
 859 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 860                                           struct dst_entry *dst,
 861                                           const struct flowi6 *fl6)
 862 {
 863         struct ipv6_pinfo *np = inet6_sk(sk);
 864         struct rt6_info *rt;
 865
 866         if (!dst)
 867                 goto out;
 868
 869         if (dst->ops->family != AF_INET6) {
 870                 dst_release(dst);
 871                 return NULL;
 872         }
 873
 874         rt = (struct rt6_info *)dst;
 875         /* Yes, checking route validity in not connected
 876          * case is not very simple. Take into account,
 877          * that we do not support routing by source, TOS,
 878          * and MSG_DONTROUTE            --ANK (980726)
 879          *
 880          * 1. ip6_rt_check(): If route was host route,
 881          *    check that cached destination is current.
 882          *    If it is network route, we still may
 883          *    check its validity using saved pointer
 884          *    to the last used address: daddr_cache.
 885          *    We do not want to save whole address now,
 886          *    (because main consumer of this service
 887          *    is tcp, which has not this problem),
 888          *    so that the last trick works only on connected
 889          *    sockets.
 890          * 2. oif also should be the same.
 891          */
 892         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 893 #ifdef CONFIG_IPV6_SUBTREES
 894             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 895 #endif
 896            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
 897               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
 898                 dst_release(dst);
 899                 dst = NULL;
 900         }
 901
 902 out:
 903         return dst;
 904 }
 905
 906 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 907                                struct dst_entry **dst, struct flowi6 *fl6)
 908 {
 909 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 910         struct neighbour *n;
 911         struct rt6_info *rt;
 912 #endif
 913         int err;
 914
 915         /* The correct way to handle this would be to do
 916          * ip6_route_get_saddr, and then ip6_route_output; however,
 917          * the route-specific preferred source forces the
 918          * ip6_route_output call _before_ ip6_route_get_saddr.
 919          *
 920          * In source specific routing (no src=any default route),
 921          * ip6_route_output will fail given src=any saddr, though, so
 922          * that's why we try it again later.
 923          */
 924         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
 925                 struct rt6_info *rt;
 926                 bool had_dst = *dst != NULL;
 927
 928                 if (!had_dst)
 929                         *dst = ip6_route_output(net, sk, fl6);
 930                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
 931                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 932                                           sk ? inet6_sk(sk)->srcprefs : 0,
 933                                           &fl6->saddr);
 934                 if (err)
 935                         goto out_err_release;
 936
 937                 /* If we had an erroneous initial result, pretend it
 938                  * never existed and let the SA-enabled version take
 939                  * over.
 940                  */
 941                 if (!had_dst && (*dst)->error) {
 942                         dst_release(*dst);
 943                         *dst = NULL;
 944                 }
 945         }
 946
 947         if (!*dst)
 948                 *dst = ip6_route_output(net, sk, fl6);
 949
 950         err = (*dst)->error;
 951         if (err)
 952                 goto out_err_release;
 953
 954 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 955         /*
 956          * Here if the dst entry we've looked up
 957          * has a neighbour entry that is in the INCOMPLETE
 958          * state and the src address from the flow is
 959          * marked as OPTIMISTIC, we release the found
 960          * dst entry and replace it instead with the
 961          * dst entry of the nexthop router
 962          */
 963         rt = (struct rt6_info *) *dst;
 964         rcu_read_lock_bh();
 965         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
 966                                       rt6_nexthop(rt, &fl6->daddr));
 967         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
 968         rcu_read_unlock_bh();
 969
 970         if (err) {
 971                 struct inet6_ifaddr *ifp;
 972                 struct flowi6 fl_gw6;
 973                 int redirect;
 974
 975                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
 976                                       (*dst)->dev, 1);
 977
 978                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 979                 if (ifp)
 980                         in6_ifa_put(ifp);
 981
 982                 if (redirect) {
 983                         /*
 984                          * We need to get the dst entry for the
 985                          * default router instead
 986                          */
 987                         dst_release(*dst);
 988                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
 989                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
 990                         *dst = ip6_route_output(net, sk, &fl_gw6);
 991                         err = (*dst)->error;
 992                         if (err)
 993                                 goto out_err_release;
 994                 }
 995         }
 996 #endif
 997
 998         return 0;
 999
1000 out_err_release:
1001         if (err == -ENETUNREACH)
1002                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1003         dst_release(*dst);
1004         *dst = NULL;
1005         return err;
1006 }
1007
1008 /**
1009  *      ip6_dst_lookup - perform route lookup on flow
1010  *      @sk: socket which provides route info
1011  *      @dst: pointer to dst_entry * for result
1012  *      @fl6: flow to lookup
1013  *
1014  *      This function performs a route lookup on the given flow.
1015  *
1016  *      It returns zero on success, or a standard errno code on error.
1017  */
1018 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1019                    struct flowi6 *fl6)
1020 {
1021         *dst = NULL;
1022         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1023 }
1024 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1025
1026 /**
1027  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1028  *      @sk: socket which provides route info
1029  *      @fl6: flow to lookup
1030  *      @final_dst: final destination address for ipsec lookup
1031  *
1032  *      This function performs a route lookup on the given flow.
1033  *
1034  *      It returns a valid dst pointer on success, or a pointer encoded
1035  *      error code.
1036  */
1037 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1038                                       const struct in6_addr *final_dst)
1039 {
1040         struct dst_entry *dst = NULL;
1041         int err;
1042
1043         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1044         if (err)
1045                 return ERR_PTR(err);
1046         if (final_dst)
1047                 fl6->daddr = *final_dst;
1048         if (!fl6->flowi6_oif)
1049                 fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);
1050
1051         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1052 }
1053 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1054
1055 /**
1056  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1057  *      @sk: socket which provides the dst cache and route info
1058  *      @fl6: flow to lookup
1059  *      @final_dst: final destination address for ipsec lookup
1060  *
1061  *      This function performs a route lookup on the given flow with the
1062  *      possibility of using the cached route in the socket if it is valid.
1063  *      It will take the socket dst lock when operating on the dst cache.
1064  *      As a result, this function can only be used in process context.
1065  *
1066  *      It returns a valid dst pointer on success, or a pointer encoded
1067  *      error code.
1068  */
1069 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1070                                          const struct in6_addr *final_dst)
1071 {
1072         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1073         int err;
1074
1075         dst = ip6_sk_dst_check(sk, dst, fl6);
1076
1077         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1078         if (err)
1079                 return ERR_PTR(err);
1080         if (final_dst)
1081                 fl6->daddr = *final_dst;
1082
1083         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1084 }
1085 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1086
1087 static inline int ip6_ufo_append_data(struct sock *sk,
1088                         struct sk_buff_head *queue,
1089                         int getfrag(void *from, char *to, int offset, int len,
1090                         int odd, struct sk_buff *skb),
1091                         void *from, int length, int hh_len, int fragheaderlen,
1092                         int transhdrlen, int mtu, unsigned int flags,
1093                         const struct flowi6 *fl6)
1094
1095 {
1096         struct sk_buff *skb;
1097         int err;
1098
1099         /* There is support for UDP large send offload by network
1100          * device, so create one single skb packet containing complete
1101          * udp datagram
1102          */
1103         skb = skb_peek_tail(queue);
1104         if (!skb) {
1105                 skb = sock_alloc_send_skb(sk,
1106                         hh_len + fragheaderlen + transhdrlen + 20,
1107                         (flags & MSG_DONTWAIT), &err);
1108                 if (!skb)
1109                         return err;
1110
1111                 /* reserve space for Hardware header */
1112                 skb_reserve(skb, hh_len);
1113
1114                 /* create space for UDP/IP header */
1115                 skb_put(skb, fragheaderlen + transhdrlen);
1116
1117                 /* initialize network header pointer */
1118                 skb_reset_network_header(skb);
1119
1120                 /* initialize protocol header pointer */
1121                 skb->transport_header = skb->network_header + fragheaderlen;
1122
1123                 skb->protocol = htons(ETH_P_IPV6);
1124                 skb->csum = 0;
1125
1126                 __skb_queue_tail(queue, skb);
1127         } else if (skb_is_gso(skb)) {
1128                 goto append;
1129         }
1130
1131         skb->ip_summed = CHECKSUM_PARTIAL;
1132         /* Specify the length of each IPv6 datagram fragment.
1133          * It has to be a multiple of 8.
1134          */
1135         skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1136                                      sizeof(struct frag_hdr)) & ~7;
1137         skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1138         skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1139                                                          &fl6->daddr,
1140                                                          &fl6->saddr);
1141
1142 append:
1143         return skb_append_datato_frags(sk, skb, getfrag, from,
1144                                        (length - transhdrlen));
1145 }
1146
1147 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1148                                                gfp_t gfp)
1149 {
1150         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1151 }
1152
1153 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1154                                                 gfp_t gfp)
1155 {
1156         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1157 }
1158
1159 static void ip6_append_data_mtu(unsigned int *mtu,
1160                                 int *maxfraglen,
1161                                 unsigned int fragheaderlen,
1162                                 struct sk_buff *skb,
1163                                 struct rt6_info *rt,
1164                                 unsigned int orig_mtu)
1165 {
1166         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1167                 if (!skb) {
1168                         /* first fragment, reserve header_len */
1169                         *mtu = orig_mtu - rt->dst.header_len;
1170
1171                 } else {
1172                         /*
1173                          * this fragment is not first, the headers
1174                          * space is regarded as data space.
1175                          */
1176                         *mtu = orig_mtu;
1177                 }
1178                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1179                               + fragheaderlen - sizeof(struct frag_hdr);
1180         }
1181 }
1182
1183 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1184                           struct inet6_cork *v6_cork,
1185                           int hlimit, int tclass, struct ipv6_txoptions *opt,
1186                           struct rt6_info *rt, struct flowi6 *fl6)
1187 {
1188         struct ipv6_pinfo *np = inet6_sk(sk);
1189         unsigned int mtu;
1190
1191         /*
1192          * setup for corking
1193          */
1194         if (opt) {
1195                 if (WARN_ON(v6_cork->opt))
1196                         return -EINVAL;
1197
1198                 v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1199                 if (unlikely(!v6_cork->opt))
1200                         return -ENOBUFS;
1201
1202                 v6_cork->opt->tot_len = opt->tot_len;
1203                 v6_cork->opt->opt_flen = opt->opt_flen;
1204                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1205
1206                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1207                                                     sk->sk_allocation);
1208                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1209                         return -ENOBUFS;
1210
1211                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1212                                                     sk->sk_allocation);
1213                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1214                         return -ENOBUFS;
1215
1216                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1217                                                    sk->sk_allocation);
1218                 if (opt->hopopt && !v6_cork->opt->hopopt)
1219                         return -ENOBUFS;
1220
1221                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1222                                                     sk->sk_allocation);
1223                 if (opt->srcrt && !v6_cork->opt->srcrt)
1224                         return -ENOBUFS;
1225
1226                 /* need source address above miyazawa*/
1227         }
1228         dst_hold(&rt->dst);
1229         cork->base.dst = &rt->dst;
1230         cork->fl.u.ip6 = *fl6;
1231         v6_cork->hop_limit = hlimit;
1232         v6_cork->tclass = tclass;
1233         if (rt->dst.flags & DST_XFRM_TUNNEL)
1234                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1235                       rt->dst.dev->mtu : dst_mtu(&rt->dst);
1236         else
1237                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1238                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1239         if (np->frag_size < mtu) {
1240                 if (np->frag_size)
1241                         mtu = np->frag_size;
1242         }
1243         cork->base.fragsize = mtu;
1244         if (dst_allfrag(rt->dst.path))
1245                 cork->base.flags |= IPCORK_ALLFRAG;
1246         cork->base.length = 0;
1247
1248         return 0;
1249 }
1250
1251 static int __ip6_append_data(struct sock *sk,
1252                              struct flowi6 *fl6,
1253                              struct sk_buff_head *queue,
1254                              struct inet_cork *cork,
1255                              struct inet6_cork *v6_cork,
1256                              struct page_frag *pfrag,
1257                              int getfrag(void *from, char *to, int offset,
1258                                          int len, int odd, struct sk_buff *skb),
1259                              void *from, int length, int transhdrlen,
1260                              unsigned int flags, int dontfrag)
1261 {
1262         struct sk_buff *skb, *skb_prev = NULL;
1263         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1264         int exthdrlen = 0;
1265         int dst_exthdrlen = 0;
1266         int hh_len;
1267         int copy;
1268         int err;
1269         int offset = 0;
1270         __u8 tx_flags = 0;
1271         u32 tskey = 0;
1272         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1273         struct ipv6_txoptions *opt = v6_cork->opt;
1274         int csummode = CHECKSUM_NONE;
1275
1276         skb = skb_peek_tail(queue);
1277         if (!skb) {
1278                 exthdrlen = opt ? opt->opt_flen : 0;
1279                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1280         }
1281
1282         mtu = cork->fragsize;
1283         orig_mtu = mtu;
1284
1285         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1286
1287         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1288                         (opt ? opt->opt_nflen : 0);
1289         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1290                      sizeof(struct frag_hdr);
1291
1292         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1293                 unsigned int maxnonfragsize, headersize;
1294
1295                 headersize = sizeof(struct ipv6hdr) +
1296                              (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1297                              (dst_allfrag(&rt->dst) ?
1298                               sizeof(struct frag_hdr) : 0) +
1299                              rt->rt6i_nfheader_len;
1300
1301                 if (ip6_sk_ignore_df(sk))
1302                         maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1303                 else
1304                         maxnonfragsize = mtu;
1305
1306                 /* dontfrag active */
1307                 if ((cork->length + length > mtu - headersize) && dontfrag &&
1308                     (sk->sk_protocol == IPPROTO_UDP ||
1309                      sk->sk_protocol == IPPROTO_RAW)) {
1310                         ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1311                                                    sizeof(struct ipv6hdr));
1312                         goto emsgsize;
1313                 }
1314
1315                 if (cork->length + length > maxnonfragsize - headersize) {
1316 emsgsize:
1317                         ipv6_local_error(sk, EMSGSIZE, fl6,
1318                                          mtu - headersize +
1319                                          sizeof(struct ipv6hdr));
1320                         return -EMSGSIZE;
1321                 }
1322         }
1323
1324         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1325                 sock_tx_timestamp(sk, &tx_flags);
1326                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1327                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1328                         tskey = sk->sk_tskey++;
1329         }
1330
1331         /* If this is the first and only packet and device
1332          * supports checksum offloading, let's use it.
1333          * Use transhdrlen, same as IPv4, because partial
1334          * sums only work when transhdrlen is set.
1335          */
1336         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1337             length + fragheaderlen < mtu &&
1338             rt->dst.dev->features & NETIF_F_V6_CSUM &&
1339             !exthdrlen)
1340                 csummode = CHECKSUM_PARTIAL;
1341         /*
1342          * Let's try using as much space as possible.
1343          * Use MTU if total length of the message fits into the MTU.
1344          * Otherwise, we need to reserve fragment header and
 1345          * fragment alignment (= 8-15 octets, in total).
1346          *
 1347          * Note that we may need to "move" the data from the tail of
 1348          * the buffer to the new fragment when we split
1349          * the message.
1350          *
1351          * FIXME: It may be fragmented into multiple chunks
1352          *        at once if non-fragmentable extension headers
1353          *        are too large.
1354          * --yoshfuji
1355          */
1356
1357         cork->length += length;
1358         if (((length > mtu) ||
1359              (skb && skb_is_gso(skb))) &&
1360             (sk->sk_protocol == IPPROTO_UDP) &&
1361             (rt->dst.dev->features & NETIF_F_UFO) &&
1362             (sk->sk_type == SOCK_DGRAM)) {
1363                 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1364                                           hh_len, fragheaderlen,
1365                                           transhdrlen, mtu, flags, fl6);
1366                 if (err)
1367                         goto error;
1368                 return 0;
1369         }
1370
1371         if (!skb)
1372                 goto alloc_new_skb;
1373
1374         while (length > 0) {
1375                 /* Check if the remaining data fits into current packet. */
1376                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1377                 if (copy < length)
1378                         copy = maxfraglen - skb->len;
1379
1380                 if (copy <= 0) {
1381                         char *data;
1382                         unsigned int datalen;
1383                         unsigned int fraglen;
1384                         unsigned int fraggap;
1385                         unsigned int alloclen;
1386 alloc_new_skb:
1387                         /* There's no room in the current skb */
1388                         if (skb)
1389                                 fraggap = skb->len - maxfraglen;
1390                         else
1391                                 fraggap = 0;
1392                         /* update mtu and maxfraglen if necessary */
1393                         if (!skb || !skb_prev)
1394                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1395                                                     fragheaderlen, skb, rt,
1396                                                     orig_mtu);
1397
1398                         skb_prev = skb;
1399
1400                         /*
1401                          * If remaining data exceeds the mtu,
1402                          * we know we need more fragment(s).
1403                          */
1404                         datalen = length + fraggap;
1405
1406                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1407                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1408                         if ((flags & MSG_MORE) &&
1409                             !(rt->dst.dev->features&NETIF_F_SG))
1410                                 alloclen = mtu;
1411                         else
1412                                 alloclen = datalen + fragheaderlen;
1413
1414                         alloclen += dst_exthdrlen;
1415
1416                         if (datalen != length + fraggap) {
1417                                 /*
1418                                  * this is not the last fragment, the trailer
1419                                  * space is regarded as data space.
1420                                  */
1421                                 datalen += rt->dst.trailer_len;
1422                         }
1423
1424                         alloclen += rt->dst.trailer_len;
1425                         fraglen = datalen + fragheaderlen;
1426
1427                         /*
1428                          * We just reserve space for fragment header.
1429                          * Note: this may be overallocation if the message
1430                          * (without MSG_MORE) fits into the MTU.
1431                          */
1432                         alloclen += sizeof(struct frag_hdr);
1433
1434                         if (transhdrlen) {
1435                                 skb = sock_alloc_send_skb(sk,
1436                                                 alloclen + hh_len,
1437                                                 (flags & MSG_DONTWAIT), &err);
1438                         } else {
1439                                 skb = NULL;
1440                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1441                                     2 * sk->sk_sndbuf)
1442                                         skb = sock_wmalloc(sk,
1443                                                            alloclen + hh_len, 1,
1444                                                            sk->sk_allocation);
1445                                 if (unlikely(!skb))
1446                                         err = -ENOBUFS;
1447                         }
1448                         if (!skb)
1449                                 goto error;
1450                         /*
1451                          *      Fill in the control structures
1452                          */
1453                         skb->protocol = htons(ETH_P_IPV6);
1454                         skb->ip_summed = csummode;
1455                         skb->csum = 0;
1456                         /* reserve for fragmentation and ipsec header */
1457                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1458                                     dst_exthdrlen);
1459
1460                         /* Only the initial fragment is time stamped */
1461                         skb_shinfo(skb)->tx_flags = tx_flags;
1462                         tx_flags = 0;
1463                         skb_shinfo(skb)->tskey = tskey;
1464                         tskey = 0;
1465
1466                         /*
1467                          *      Find where to start putting bytes
1468                          */
1469                         data = skb_put(skb, fraglen);
1470                         skb_set_network_header(skb, exthdrlen);
1471                         data += fragheaderlen;
1472                         skb->transport_header = (skb->network_header +
1473                                                  fragheaderlen);
1474                         if (fraggap) {
1475                                 skb->csum = skb_copy_and_csum_bits(
1476                                         skb_prev, maxfraglen,
1477                                         data + transhdrlen, fraggap, 0);
1478                                 skb_prev->csum = csum_sub(skb_prev->csum,
1479                                                           skb->csum);
1480                                 data += fraggap;
1481                                 pskb_trim_unique(skb_prev, maxfraglen);
1482                         }
1483                         copy = datalen - transhdrlen - fraggap;
1484
1485                         if (copy < 0) {
1486                                 err = -EINVAL;
1487                                 kfree_skb(skb);
1488                                 goto error;
1489                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1490                                 err = -EFAULT;
1491                                 kfree_skb(skb);
1492                                 goto error;
1493                         }
1494
1495                         offset += copy;
1496                         length -= datalen - fraggap;
1497                         transhdrlen = 0;
1498                         exthdrlen = 0;
1499                         dst_exthdrlen = 0;
1500
1501                         /*
1502                          * Put the packet on the pending queue
1503                          */
1504                         __skb_queue_tail(queue, skb);
1505                         continue;
1506                 }
1507
1508                 if (copy > length)
1509                         copy = length;
1510
1511                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1512                         unsigned int off;
1513
1514                         off = skb->len;
1515                         if (getfrag(from, skb_put(skb, copy),
1516                                                 offset, copy, off, skb) < 0) {
1517                                 __skb_trim(skb, off);
1518                                 err = -EFAULT;
1519                                 goto error;
1520                         }
1521                 } else {
1522                         int i = skb_shinfo(skb)->nr_frags;
1523
1524                         err = -ENOMEM;
1525                         if (!sk_page_frag_refill(sk, pfrag))
1526                                 goto error;
1527
1528                         if (!skb_can_coalesce(skb, i, pfrag->page,
1529                                               pfrag->offset)) {
1530                                 err = -EMSGSIZE;
1531                                 if (i == MAX_SKB_FRAGS)
1532                                         goto error;
1533
1534                                 __skb_fill_page_desc(skb, i, pfrag->page,
1535                                                      pfrag->offset, 0);
1536                                 skb_shinfo(skb)->nr_frags = ++i;
1537                                 get_page(pfrag->page);
1538                         }
1539                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1540                         if (getfrag(from,
1541                                     page_address(pfrag->page) + pfrag->offset,
1542                                     offset, copy, skb->len, skb) < 0)
1543                                 goto error_efault;
1544
1545                         pfrag->offset += copy;
1546                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1547                         skb->len += copy;
1548                         skb->data_len += copy;
1549                         skb->truesize += copy;
1550                         atomic_add(copy, &sk->sk_wmem_alloc);
1551                 }
1552                 offset += copy;
1553                 length -= copy;
1554         }
1555
1556         return 0;
1557
1558 error_efault:
1559         err = -EFAULT;
1560 error:
1561         cork->length -= length;
1562         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1563         return err;
1564 }
1565
1566 int ip6_append_data(struct sock *sk,
1567                     int getfrag(void *from, char *to, int offset, int len,
1568                                 int odd, struct sk_buff *skb),
1569                     void *from, int length, int transhdrlen, int hlimit,
1570                     int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1571                     struct rt6_info *rt, unsigned int flags, int dontfrag)
1572 {
1573         struct inet_sock *inet = inet_sk(sk);
1574         struct ipv6_pinfo *np = inet6_sk(sk);
1575         int exthdrlen;
1576         int err;
1577
1578         if (flags&MSG_PROBE)
1579                 return 0;
1580         if (skb_queue_empty(&sk->sk_write_queue)) {
1581                 /*
1582                  * setup for corking
1583                  */
1584                 err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
1585                                      tclass, opt, rt, fl6);
1586                 if (err)
1587                         return err;
1588
1589                 exthdrlen = (opt ? opt->opt_flen : 0);
1590                 length += exthdrlen;
1591                 transhdrlen += exthdrlen;
1592         } else {
1593                 fl6 = &inet->cork.fl.u.ip6;
1594                 transhdrlen = 0;
1595         }
1596
1597         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1598                                  &np->cork, sk_page_frag(sk), getfrag,
1599                                  from, length, transhdrlen, flags, dontfrag);
1600 }
1601 EXPORT_SYMBOL_GPL(ip6_append_data);
1602
1603 static void ip6_cork_release(struct inet_cork_full *cork,
1604                              struct inet6_cork *v6_cork)
1605 {
1606         if (v6_cork->opt) {
1607                 kfree(v6_cork->opt->dst0opt);
1608                 kfree(v6_cork->opt->dst1opt);
1609                 kfree(v6_cork->opt->hopopt);
1610                 kfree(v6_cork->opt->srcrt);
1611                 kfree(v6_cork->opt);
1612                 v6_cork->opt = NULL;
1613         }
1614
1615         if (cork->base.dst) {
1616                 dst_release(cork->base.dst);
1617                 cork->base.dst = NULL;
1618                 cork->base.flags &= ~IPCORK_ALLFRAG;
1619         }
1620         memset(&cork->fl, 0, sizeof(cork->fl));
1621 }
1622
1623 struct sk_buff *__ip6_make_skb(struct sock *sk,
1624                                struct sk_buff_head *queue,
1625                                struct inet_cork_full *cork,
1626                                struct inet6_cork *v6_cork)
1627 {
1628         struct sk_buff *skb, *tmp_skb;
1629         struct sk_buff **tail_skb;
1630         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1631         struct ipv6_pinfo *np = inet6_sk(sk);
1632         struct net *net = sock_net(sk);
1633         struct ipv6hdr *hdr;
1634         struct ipv6_txoptions *opt = v6_cork->opt;
1635         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1636         struct flowi6 *fl6 = &cork->fl.u.ip6;
1637         unsigned char proto = fl6->flowi6_proto;
1638
1639         skb = __skb_dequeue(queue);
1640         if (!skb)
1641                 goto out;
1642         tail_skb = &(skb_shinfo(skb)->frag_list);
1643
1644         /* move skb->data to ip header from ext header */
1645         if (skb->data < skb_network_header(skb))
1646                 __skb_pull(skb, skb_network_offset(skb));
1647         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1648                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1649                 *tail_skb = tmp_skb;
1650                 tail_skb = &(tmp_skb->next);
1651                 skb->len += tmp_skb->len;
1652                 skb->data_len += tmp_skb->len;
1653                 skb->truesize += tmp_skb->truesize;
1654                 tmp_skb->destructor = NULL;
1655                 tmp_skb->sk = NULL;
1656         }
1657
1658         /* Allow local fragmentation. */
1659         skb->ignore_df = ip6_sk_ignore_df(sk);
1660
1661         *final_dst = fl6->daddr;
1662         __skb_pull(skb, skb_network_header_len(skb));
1663         if (opt && opt->opt_flen)
1664                 ipv6_push_frag_opts(skb, opt, &proto);
1665         if (opt && opt->opt_nflen)
1666                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1667
1668         skb_push(skb, sizeof(struct ipv6hdr));
1669         skb_reset_network_header(skb);
1670         hdr = ipv6_hdr(skb);
1671
1672         ip6_flow_hdr(hdr, v6_cork->tclass,
1673                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1674                                         np->autoflowlabel, fl6));
1675         hdr->hop_limit = v6_cork->hop_limit;
1676         hdr->nexthdr = proto;
1677         hdr->saddr = fl6->saddr;
1678         hdr->daddr = *final_dst;
1679
1680         skb->priority = sk->sk_priority;
1681         skb->mark = sk->sk_mark;
1682
1683         skb_dst_set(skb, dst_clone(&rt->dst));
1684         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1685         if (proto == IPPROTO_ICMPV6) {
1686                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1687
1688                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1689                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1690         }
1691
1692         ip6_cork_release(cork, v6_cork);
1693 out:
1694         return skb;
1695 }
1696
1697 int ip6_send_skb(struct sk_buff *skb)
1698 {
1699         struct net *net = sock_net(skb->sk);
1700         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1701         int err;
1702
1703         err = ip6_local_out(net, skb->sk, skb);
1704         if (err) {
1705                 if (err > 0)
1706                         err = net_xmit_errno(err);
1707                 if (err)
1708                         IP6_INC_STATS(net, rt->rt6i_idev,
1709                                       IPSTATS_MIB_OUTDISCARDS);
1710         }
1711
1712         return err;
1713 }
1714
/*
 *	Send whatever ip6_finish_skb() produces for @sk.
 *
 *	Returns 0 when there is no skb to send, otherwise the result of
 *	ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1726
1727 static void __ip6_flush_pending_frames(struct sock *sk,
1728                                        struct sk_buff_head *queue,
1729                                        struct inet_cork_full *cork,
1730                                        struct inet6_cork *v6_cork)
1731 {
1732         struct sk_buff *skb;
1733
1734         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1735                 if (skb_dst(skb))
1736                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1737                                       IPSTATS_MIB_OUTDISCARDS);
1738                 kfree_skb(skb);
1739         }
1740
1741         ip6_cork_release(cork, v6_cork);
1742 }
1743
1744 void ip6_flush_pending_frames(struct sock *sk)
1745 {
1746         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1747                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1748 }
1749 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1750
1751 struct sk_buff *ip6_make_skb(struct sock *sk,
1752                              int getfrag(void *from, char *to, int offset,
1753                                          int len, int odd, struct sk_buff *skb),
1754                              void *from, int length, int transhdrlen,
1755                              int hlimit, int tclass,
1756                              struct ipv6_txoptions *opt, struct flowi6 *fl6,
1757                              struct rt6_info *rt, unsigned int flags,
1758                              int dontfrag)
1759 {
1760         struct inet_cork_full cork;
1761         struct inet6_cork v6_cork;
1762         struct sk_buff_head queue;
1763         int exthdrlen = (opt ? opt->opt_flen : 0);
1764         int err;
1765
1766         if (flags & MSG_PROBE)
1767                 return NULL;
1768
1769         __skb_queue_head_init(&queue);
1770
1771         cork.base.flags = 0;
1772         cork.base.addr = 0;
1773         cork.base.opt = NULL;
1774         v6_cork.opt = NULL;
1775         err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
1776         if (err)
1777                 return ERR_PTR(err);
1778
1779         if (dontfrag < 0)
1780                 dontfrag = inet6_sk(sk)->dontfrag;
1781
1782         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1783                                 &current->task_frag, getfrag, from,
1784                                 length + exthdrlen, transhdrlen + exthdrlen,
1785                                 flags, dontfrag);
1786         if (err) {
1787                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1788                 return ERR_PTR(err);
1789         }
1790
1791         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1792 }