2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv6.h>
48 #include <net/ndisc.h>
49 #include <net/protocol.h>
50 #include <net/ip6_route.h>
51 #include <net/addrconf.h>
52 #include <net/rawv6.h>
55 #include <net/checksum.h>
56 #include <linux/mroute6.h>
58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * Assign the next value of the global IPv6 fragment identification
 * counter to the fragment header, in network byte order.  The counter
 * is protected by ip6_id_lock and deliberately skips 0 so that an ID
 * of zero is never handed out.
 */
60 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 static u32 ipv6_fragmentation_id = 1;
63 static DEFINE_SPINLOCK(ip6_id_lock);
65 spin_lock_bh(&ip6_id_lock);
66 fhdr->identification = htonl(ipv6_fragmentation_id);
/* Wrap back to 1, never 0, when the 32-bit counter overflows. */
67 if (++ipv6_fragmentation_id == 0)
68 ipv6_fragmentation_id = 1;
69 spin_unlock_bh(&ip6_id_lock);
/*
 * Finalize a locally generated packet: fill in the IPv6 payload length
 * (unless it exceeds IPV6_MAXPLEN — presumably left for jumbogram
 * handling on the elided branch; confirm against the full source) and
 * run it through the NF_INET_LOCAL_OUT netfilter hook.
 */
72 int __ip6_local_out(struct sk_buff *skb)
76 len = skb->len - sizeof(struct ipv6hdr);
77 if (len > IPV6_MAXPLEN)
79 ipv6_hdr(skb)->payload_len = htons(len);
81 return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
/*
 * Send a locally generated packet: run the LOCAL_OUT hook via
 * __ip6_local_out(), then hand the skb to dst_output() (the elided
 * line between the two calls presumably checks the hook verdict —
 * confirm against the full source).
 */
85 int ip6_local_out(struct sk_buff *skb)
89 err = __ip6_local_out(skb);
91 err = dst_output(skb);
95 EXPORT_SYMBOL_GPL(ip6_local_out);
/*
 * Last step of output: emit the packet via the cached hardware header
 * (neigh_hh_output) when one exists, otherwise via the neighbour's
 * output method.  With neither available the packet cannot be sent and
 * OUTNOROUTES is counted (the drop itself is on an elided line).
 */
97 static int ip6_output_finish(struct sk_buff *skb)
99 struct dst_entry *dst = skb->dst;
102 return neigh_hh_output(dst->hh, skb);
103 else if (dst->neighbour)
104 return dst->neighbour->output(skb);
106 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
112 /* dev_loopback_xmit for use with netfilter. */
/*
 * Loop a multicast copy back to the local stack: rewind to the network
 * header, mark the skb as looped-back, and skip checksumming since the
 * data never left the host.
 */
113 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
115 skb_reset_mac_header(newskb);
116 __skb_pull(newskb, skb_network_offset(newskb));
117 newskb->pkt_type = PACKET_LOOPBACK;
118 newskb->ip_summed = CHECKSUM_UNNECESSARY;
/* A loopback-delivered skb must already carry a valid route. */
119 BUG_TRAP(newskb->dst);
/*
 * Device-level output for an (already fragmented, if needed) packet.
 * For multicast destinations this delivers a looped-back clone to
 * local listeners when the socket has mc_loop enabled, enforces the
 * hop-limit-0 discard, and bumps the multicast counters.  All packets
 * finally pass through the NF_INET_POST_ROUTING hook.
 */
126 static int ip6_output2(struct sk_buff *skb)
128 struct dst_entry *dst = skb->dst;
129 struct net_device *dev = dst->dev;
131 skb->protocol = htons(ETH_P_IPV6);
134 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
135 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
136 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
/*
 * Loop a copy back locally unless the device is loopback, the
 * socket disabled mc_loop, or the packet was already forwarded
 * by the multicast routing socket.
 */
138 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
139 ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
140 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
141 &ipv6_hdr(skb)->saddr))) {
142 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
144 /* Do not check for IFF_ALLMULTI; multicast routing
145 is not supported in any case.
148 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
150 ip6_dev_loopback_xmit);
/* Hop limit 0 on multicast: deliver locally only, never onto the wire. */
152 if (ipv6_hdr(skb)->hop_limit == 0) {
153 IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
159 IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
162 return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
/*
 * MTU to use for this skb's route: the raw device MTU when the socket
 * opted into IPV6_PMTUDISC_PROBE (it probes path MTU itself), otherwise
 * the path MTU cached on the dst entry.
 */
166 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
168 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
170 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
171 skb->dst->dev->mtu : dst_mtu(skb->dst);
/*
 * Main output entry: fragment when the packet exceeds the route MTU
 * (and is not GSO, which fragments later in hardware/software) or when
 * the route demands fragmentation of every packet (dst_allfrag);
 * otherwise transmit directly via ip6_output2().
 */
174 int ip6_output(struct sk_buff *skb)
176 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
177 dst_allfrag(skb->dst))
178 return ip6_fragment(skb, ip6_output2);
180 return ip6_output2(skb);
184 * xmit an sk_buff (used by TCP)
/*
 * Build the IPv6 header (and any extension headers from @opt) onto an
 * skb and send it through NF_INET_LOCAL_OUT.  @ipfragok permits
 * sending over-MTU packets; otherwise an over-MTU non-GSO packet gets
 * ICMPV6_PKT_TOOBIG reported back to the sender and is dropped.
 */
187 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
188 struct ipv6_txoptions *opt, int ipfragok)
190 struct ipv6_pinfo *np = inet6_sk(sk);
191 struct in6_addr *first_hop = &fl->fl6_dst;
192 struct dst_entry *dst = skb->dst;
194 u8 proto = fl->proto;
195 int seg_len = skb->len;
200 unsigned int head_room;
202 /* First: exthdrs may take lots of space (~8K for now)
203 MAX_HEADER is not enough.
205 head_room = opt->opt_nflen + opt->opt_flen;
206 seg_len += head_room;
207 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
/* Re-allocate headroom when the skb cannot hold all headers in front. */
209 if (skb_headroom(skb) < head_room) {
210 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
212 IP6_INC_STATS(ip6_dst_idev(skb->dst),
213 IPSTATS_MIB_OUTDISCARDS);
220 skb_set_owner_w(skb, sk);
/* Push fragmentable then non-fragmentable extension headers;
 * the latter may rewrite first_hop (e.g. routing header). */
223 ipv6_push_frag_opts(skb, opt, &proto);
225 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
228 skb_push(skb, sizeof(struct ipv6hdr));
229 skb_reset_network_header(skb);
233 * Fill in the IPv6 header
/* Hop limit: per-socket value if set, else the route default. */
238 hlimit = np->hop_limit;
240 hlimit = ip6_dst_hoplimit(dst);
/* Version 6, traffic class, and flow label packed into the first word. */
248 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
250 hdr->payload_len = htons(seg_len);
251 hdr->nexthdr = proto;
252 hdr->hop_limit = hlimit;
254 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
255 ipv6_addr_copy(&hdr->daddr, first_hop);
257 skb->priority = sk->sk_priority;
258 skb->mark = sk->sk_mark;
/* Fits the MTU (or caller allows fragmentation / GSO): send it. */
261 if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
262 IP6_INC_STATS(ip6_dst_idev(skb->dst),
263 IPSTATS_MIB_OUTREQUESTS);
264 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
/* Too big and fragmentation not allowed: tell our own stack. */
269 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
271 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
272 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
277 EXPORT_SYMBOL(ip6_xmit);
280 * To avoid extra problems ND packets are send through this
281 * routine. It's code duplication but I really want to avoid
282 * extra checks since ipv6_build_header is used by TCP (which
283 * is for us performance critical)
/*
 * Build a minimal IPv6 header for neighbour-discovery packets: fixed
 * version word, explicit payload length, next header, the socket's
 * hop limit, and caller-supplied source/destination addresses.
 */
286 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
287 const struct in6_addr *saddr, const struct in6_addr *daddr,
290 struct ipv6_pinfo *np = inet6_sk(sk);
294 skb->protocol = htons(ETH_P_IPV6);
297 totlen = len + sizeof(struct ipv6hdr);
299 skb_reset_network_header(skb);
300 skb_put(skb, sizeof(struct ipv6hdr));
/* Version 6, zero traffic class, zero flow label. */
303 *(__be32*)hdr = htonl(0x60000000);
305 hdr->payload_len = htons(len);
306 hdr->nexthdr = proto;
307 hdr->hop_limit = np->hop_limit;
309 ipv6_addr_copy(&hdr->saddr, saddr);
310 ipv6_addr_copy(&hdr->daddr, daddr);
/*
 * Deliver a Router-Alert packet to every raw socket registered on the
 * ip6_ra_chain whose selector matches @sel and whose device binding
 * (if any) matches the incoming device.  Clones are handed to all but
 * the last matching socket; the last one consumes the skb itself, in
 * which case the function signals the caller that the packet is gone.
 */
315 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
317 struct ip6_ra_chain *ra;
318 struct sock *last = NULL;
320 read_lock(&ip6_ra_lock);
321 for (ra = ip6_ra_chain; ra; ra = ra->next) {
322 struct sock *sk = ra->sk;
323 if (sk && ra->sel == sel &&
324 (!sk->sk_bound_dev_if ||
325 sk->sk_bound_dev_if == skb->dev->ifindex)) {
/* A previous match exists: give it a clone, keep the original. */
327 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
329 rawv6_rcv(last, skb2);
/* Final matching socket receives the original skb. */
336 rawv6_rcv(last, skb);
337 read_unlock(&ip6_ra_lock);
340 read_unlock(&ip6_ra_lock);
/*
 * Decide how to treat a packet addressed to a proxied neighbour:
 * walk past any extension headers, and when the payload is ICMPv6
 * neighbour-discovery, let it be handled locally on behalf of the
 * proxied address.  Link-local destinations cannot be proxied and
 * cause a link failure to be reported to the sender.
 */
344 static int ip6_forward_proxy_check(struct sk_buff *skb)
346 struct ipv6hdr *hdr = ipv6_hdr(skb);
347 u8 nexthdr = hdr->nexthdr;
350 if (ipv6_ext_hdr(nexthdr)) {
351 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
355 offset = sizeof(struct ipv6hdr);
357 if (nexthdr == IPPROTO_ICMPV6) {
358 struct icmp6hdr *icmp6;
/* Make sure at least the ICMPv6 type byte is in the linear area. */
360 if (!pskb_may_pull(skb, (skb_network_header(skb) +
361 offset + 1 - skb->data)))
364 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
366 switch (icmp6->icmp6_type) {
367 case NDISC_ROUTER_SOLICITATION:
368 case NDISC_ROUTER_ADVERTISEMENT:
369 case NDISC_NEIGHBOUR_SOLICITATION:
370 case NDISC_NEIGHBOUR_ADVERTISEMENT:
372 /* For reaction involving unicast neighbor discovery
373 * message destined to the proxied address, pass it to
383 * The proxying router can't forward traffic sent to a link-local
384 * address, so signal the sender and discard the packet. This
385 * behavior is clarified by the MIPv6 specification.
387 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
388 dst_link_failure(skb);
/* Final step of forwarding: hand the skb to the route's output path. */
395 static inline int ip6_forward_finish(struct sk_buff *skb)
397 return dst_output(skb);
/*
 * Forward a received packet toward its next hop.  Performs the full
 * router-side checks: forwarding enabled, no LRO-merged skbs, XFRM
 * forward policy, Router-Alert delivery to user space, hop-limit
 * decrement with ICMP time-exceeded on expiry, ND-proxy handling,
 * redirect generation, source-address sanity, and path-MTU
 * enforcement, before queueing via the NF_INET_FORWARD hook.
 */
400 int ip6_forward(struct sk_buff *skb)
402 struct dst_entry *dst = skb->dst;
403 struct ipv6hdr *hdr = ipv6_hdr(skb);
404 struct inet6_skb_parm *opt = IP6CB(skb);
405 struct net *net = dev_net(dst->dev);
/* Not acting as a router: refuse to forward. */
407 if (ipv6_devconf.forwarding == 0)
410 if (skb_warn_if_lro(skb))
413 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
414 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
418 skb_forward_csum(skb);
421 * We DO NOT make any processing on
422 * RA packets, pushing them to user level AS IS
423 * without any warranty that application will be able
424 * to interpret them. The reason is that we
425 * cannot make anything clever here.
427 * We are not end-node, so that if packet contains
428 * AH/ESP, we cannot make anything.
429 * Defragmentation also would be mistake, RA packets
430 * cannot be fragmented, because there is no warranty
431 * that different fragments will go along one path. --ANK
434 u8 *ptr = skb_network_header(skb) + opt->ra;
435 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
440 * check and decrement ttl
442 if (hdr->hop_limit <= 1) {
443 /* Force OUTPUT device used as source address */
445 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
447 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
453 /* XXX: idev->cnf.proxy_ndp? */
/* ND proxying: packets for proxied addresses may be consumed locally. */
454 if (ipv6_devconf.proxy_ndp &&
455 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
456 int proxied = ip6_forward_proxy_check(skb);
458 return ip6_input(skb);
459 else if (proxied < 0) {
460 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
465 if (!xfrm6_route_forward(skb)) {
466 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
471 /* IPv6 specs say nothing about it, but it is clear that we cannot
472 send redirects to source routed frames.
473 We don't send redirects to frames decapsulated from IPsec.
475 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
477 struct in6_addr *target = NULL;
479 struct neighbour *n = dst->neighbour;
482 * incoming and outgoing devices are the same
/* Redirect target: the gateway if the route has one, else the
 * final destination itself. */
486 rt = (struct rt6_info *) dst;
487 if ((rt->rt6i_flags & RTF_GATEWAY))
488 target = (struct in6_addr*)&n->primary_key;
490 target = &hdr->daddr;
492 /* Limit redirects both by destination (here)
493 and by source (inside ndisc_send_redirect)
495 if (xrlim_allow(dst, 1*HZ))
496 ndisc_send_redirect(skb, n, target);
498 int addrtype = ipv6_addr_type(&hdr->saddr);
500 /* This check is security critical. */
/* Never forward packets with unspecified, multicast, or loopback
 * source addresses; link-local sources get an explicit error. */
501 if (addrtype == IPV6_ADDR_ANY ||
502 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
504 if (addrtype & IPV6_ADDR_LINKLOCAL) {
505 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
506 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
/* Routers never fragment forwarded IPv6 packets: report PMTU instead. */
511 if (skb->len > dst_mtu(dst)) {
512 /* Again, force OUTPUT device used as source address */
514 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
515 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
516 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
/* Private copy of the header before we decrement the hop limit. */
521 if (skb_cow(skb, dst->dev->hard_header_len)) {
522 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
528 /* Mangling hops number delayed to point after skb COW */
532 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
533 return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
537 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * Copy per-packet metadata from the original skb to a freshly built
 * fragment: packet type, priority, protocol, a cloned route reference,
 * the firewall mark, and (when configured) traffic-control index,
 * netfilter trace flag, and security mark.
 */
543 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
545 to->pkt_type = from->pkt_type;
546 to->priority = from->priority;
547 to->protocol = from->protocol;
/* Drop any stale dst on the target before taking a fresh reference. */
548 dst_release(to->dst);
549 to->dst = dst_clone(from->dst);
551 to->mark = from->mark;
553 #ifdef CONFIG_NET_SCHED
554 to->tc_index = from->tc_index;
557 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
558 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
559 to->nf_trace = from->nf_trace;
561 skb_copy_secmark(to, from);
/*
 * Locate the insertion point for a Fragment header: walk the chain of
 * extension headers starting right after the IPv6 header and return
 * the offset of the first header that belongs in the fragmentable
 * part.  *nexthdr is left pointing at the "next header" byte that must
 * be overwritten with NEXTHDR_FRAGMENT.
 */
564 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
566 u16 offset = sizeof(struct ipv6hdr);
567 struct ipv6_opt_hdr *exthdr =
568 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
569 unsigned int packet_len = skb->tail - skb->network_header;
571 *nexthdr = &ipv6_hdr(skb)->nexthdr;
573 while (offset + 1 <= packet_len) {
579 case NEXTHDR_ROUTING:
/* With Mobile IPv6, a Home Address option forces fragmentation
 * to start earlier — confirm handling against the elided lines. */
583 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
584 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
594 offset += ipv6_optlen(exthdr);
595 *nexthdr = &exthdr->nexthdr;
596 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
/*
 * Fragment an over-MTU packet and pass each fragment to @output.
 * Two strategies:
 *  - fast path: when the skb already carries a frag_list with suitably
 *    sized, unshared fragments, a Fragment header is spliced into each
 *    piece in place and the chain is sent without copying;
 *  - slow path: otherwise fresh skbs are allocated and the payload is
 *    copied out in MTU-sized, 8-byte-aligned chunks.
 * Sockets that forced PMTU discovery (!local_df) instead get an
 * ICMPV6_PKT_TOOBIG and the packet is dropped.
 */
603 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
605 struct net_device *dev;
606 struct sk_buff *frag;
607 struct rt6_info *rt = (struct rt6_info*)skb->dst;
608 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
609 struct ipv6hdr *tmp_hdr;
611 unsigned int mtu, hlen, left, len;
613 int ptr, offset = 0, err=0;
614 u8 *prevhdr, nexthdr = 0;
617 hlen = ip6_find_1stfragopt(skb, &prevhdr);
620 mtu = ip6_skb_dst_mtu(skb);
622 /* We must not fragment if the socket is set to force MTU discovery
623 * or if the skb it not generated by a local socket. (This last
624 * check should be redundant, but it's free.)
626 if (!skb->local_df) {
627 skb->dev = skb->dst->dev;
628 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
629 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
/* Honor a smaller per-socket fragment size if one is configured. */
634 if (np && np->frag_size < mtu) {
/* Usable payload per fragment: MTU minus the unfragmentable part
 * and the Fragment header itself. */
638 mtu -= hlen + sizeof(struct frag_hdr);
/* ---- Fast path: packet already split into a frag_list ---- */
640 if (skb_shinfo(skb)->frag_list) {
641 int first_len = skb_pagelen(skb);
/* The geometry must be usable as-is: every piece within MTU,
 * all but the last 8-byte aligned, with headroom for headers. */
644 if (first_len - hlen > mtu ||
645 ((first_len - hlen) & 7) ||
649 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
650 /* Correct geometry. */
651 if (frag->len > mtu ||
652 ((frag->len & 7) && frag->next) ||
653 skb_headroom(frag) < hlen)
656 /* Partially cloned skb? */
657 if (skb_shared(frag))
/* Transfer write-memory accounting from the head skb to each
 * fragment so the socket's wmem stays balanced. */
664 frag->destructor = sock_wfree;
665 truesizes += frag->truesize;
671 frag = skb_shinfo(skb)->frag_list;
672 skb_shinfo(skb)->frag_list = NULL;
/* Keep a copy of the unfragmentable headers to replicate into
 * every fragment. */
675 *prevhdr = NEXTHDR_FRAGMENT;
676 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
678 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
/* Open space for the Fragment header in the first piece. */
682 __skb_pull(skb, hlen);
683 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
684 __skb_push(skb, hlen);
685 skb_reset_network_header(skb);
686 memcpy(skb_network_header(skb), tmp_hdr, hlen);
688 ipv6_select_ident(skb, fh);
689 fh->nexthdr = nexthdr;
/* First fragment: offset 0, More Fragments set. */
691 fh->frag_off = htons(IP6_MF);
692 frag_id = fh->identification;
694 first_len = skb_pagelen(skb);
695 skb->data_len = first_len - skb_headlen(skb);
696 skb->truesize -= truesizes;
697 skb->len = first_len;
698 ipv6_hdr(skb)->payload_len = htons(first_len -
699 sizeof(struct ipv6hdr));
701 dst_hold(&rt->u.dst);
704 /* Prepare header of the next frame,
705 * before previous one went down. */
707 frag->ip_summed = CHECKSUM_NONE;
708 skb_reset_transport_header(frag);
709 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
710 __skb_push(frag, hlen);
711 skb_reset_network_header(frag);
712 memcpy(skb_network_header(frag), tmp_hdr,
714 offset += skb->len - hlen - sizeof(struct frag_hdr);
715 fh->nexthdr = nexthdr;
717 fh->frag_off = htons(offset);
/* MF is set on every fragment except the last. */
718 if (frag->next != NULL)
719 fh->frag_off |= htons(IP6_MF);
720 fh->identification = frag_id;
721 ipv6_hdr(frag)->payload_len =
723 sizeof(struct ipv6hdr));
724 ip6_copy_metadata(frag, skb);
729 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
742 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
743 dst_release(&rt->u.dst);
753 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
754 dst_release(&rt->u.dst);
/* ---- Slow path: allocate and copy each fragment ---- */
759 left = skb->len - hlen; /* Space per frame */
760 ptr = hlen; /* Where to start from */
763 * Fragment the datagram.
766 *prevhdr = NEXTHDR_FRAGMENT;
769 * Keep copying data until we run out.
773 /* IF: it doesn't fit, use 'mtu' - the data space left */
776 /* IF: we are not sending upto and including the packet end
777 then align the next start on an eight byte boundary */
785 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
786 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
787 IP6_INC_STATS(ip6_dst_idev(skb->dst),
788 IPSTATS_MIB_FRAGFAILS);
794 * Set up data on packet
797 ip6_copy_metadata(frag, skb);
798 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
799 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
800 skb_reset_network_header(frag);
801 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
802 frag->transport_header = (frag->network_header + hlen +
803 sizeof(struct frag_hdr));
806 * Charge the memory for the fragment to any owner
810 skb_set_owner_w(frag, skb->sk);
813 * Copy the packet header into the new buffer.
815 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
818 * Build fragment header.
820 fh->nexthdr = nexthdr;
/* ID is drawn once (first fragment) and reused for the rest. */
823 ipv6_select_ident(skb, fh);
824 frag_id = fh->identification;
826 fh->identification = frag_id;
829 * Copy a block of the IP datagram.
831 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
835 fh->frag_off = htons(offset);
837 fh->frag_off |= htons(IP6_MF);
838 ipv6_hdr(frag)->payload_len = htons(frag->len -
839 sizeof(struct ipv6hdr));
845 * Put this fragment into the sending queue.
851 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
853 IP6_INC_STATS(ip6_dst_idev(skb->dst),
854 IPSTATS_MIB_FRAGOKS);
859 IP6_INC_STATS(ip6_dst_idev(skb->dst),
860 IPSTATS_MIB_FRAGFAILS);
/*
 * Check whether a cached route is stale for the given flow address:
 * returns non-zero when the route is neither an exact host route for
 * @fl_addr nor matched by the socket's cached destination address.
 */
865 static inline int ip6_rt_check(struct rt6key *rt_key,
866 struct in6_addr *fl_addr,
867 struct in6_addr *addr_cache)
869 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
870 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
/*
 * Validate a socket's cached dst entry against the flow about to be
 * sent.  Returns the dst if still usable; the mismatch branch (elided
 * here) presumably releases it and returns NULL so the caller redoes
 * the route lookup — confirm against the full source.
 */
873 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
874 struct dst_entry *dst,
877 struct ipv6_pinfo *np = inet6_sk(sk);
878 struct rt6_info *rt = (struct rt6_info *)dst;
883 /* Yes, checking route validity in not connected
884 * case is not very simple. Take into account,
885 * that we do not support routing by source, TOS,
886 * and MSG_DONTROUTE --ANK (980726)
888 * 1. ip6_rt_check(): If route was host route,
889 * check that cached destination is current.
890 * If it is network route, we still may
891 * check its validity using saved pointer
892 * to the last used address: daddr_cache.
893 * We do not want to save whole address now,
894 * (because main consumer of this service
895 * is tcp, which has not this problem),
896 * so that the last trick works only on connected
898 * 2. oif also should be the same.
900 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
901 #ifdef CONFIG_IPV6_SUBTREES
902 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
904 (fl->oif && fl->oif != dst->dev->ifindex)) {
/*
 * Common tail of route lookup: resolve the flow to a dst entry, pick a
 * source address if the flow left it unspecified, and — when the
 * chosen source is an optimistic-DAD address whose nexthop neighbour
 * is not yet valid — redo the lookup toward the default router.
 * On failure releases the dst and returns a negative errno.
 */
913 static int ip6_dst_lookup_tail(struct sock *sk,
914 struct dst_entry **dst, struct flowi *fl)
917 struct net *net = sock_net(sk);
920 *dst = ip6_route_output(net, sk, fl);
922 if ((err = (*dst)->error))
923 goto out_err_release;
/* No source address given: derive one from the output device,
 * honoring the socket's source-address preferences. */
925 if (ipv6_addr_any(&fl->fl6_src)) {
926 err = ipv6_dev_get_saddr(ip6_dst_idev(*dst)->dev,
928 sk ? inet6_sk(sk)->srcprefs : 0,
931 goto out_err_release;
934 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
936 * Here if the dst entry we've looked up
937 * has a neighbour entry that is in the INCOMPLETE
938 * state and the src address from the flow is
939 * marked as OPTIMISTIC, we release the found
940 * dst entry and replace it instead with the
941 * dst entry of the nexthop router
943 if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
944 struct inet6_ifaddr *ifp;
948 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
951 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
957 * We need to get the dst entry for the
958 * default router instead
961 memcpy(&fl_gw, fl, sizeof(struct flowi));
/* Zero destination => route toward the default gateway. */
962 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
963 *dst = ip6_route_output(net, sk, &fl_gw);
964 if ((err = (*dst)->error))
965 goto out_err_release;
973 if (err == -ENETUNREACH)
974 IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
981 * ip6_dst_lookup - perform route lookup on flow
982 * @sk: socket which provides route info
983 * @dst: pointer to dst_entry * for result
984 * @fl: flow to lookup
986 * This function performs a route lookup on the given flow.
988 * It returns zero on success, or a standard errno code on error.
990 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
993 return ip6_dst_lookup_tail(sk, dst, fl);
995 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
998 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
999 * @sk: socket which provides the dst cache and route info
1000 * @dst: pointer to dst_entry * for result
1001 * @fl: flow to lookup
1003 * This function performs a route lookup on the given flow with the
1004 * possibility of using the cached route in the socket if it is valid.
1005 * It will take the socket dst lock when operating on the dst cache.
1006 * As a result, this function can only be used in process context.
1008 * It returns zero on success, or a standard errno code on error.
1010 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
/* Try the cached route first; fall back to a fresh lookup. */
1014 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1015 *dst = ip6_sk_dst_check(sk, *dst, fl);
1018 return ip6_dst_lookup_tail(sk, dst, fl);
1020 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
/*
 * Append data for UDP fragmentation offload (UFO): build (or extend)
 * a single large skb whose payload lives in page fragments, tag it
 * with gso_size/gso_type so the device splits it into MTU-sized
 * fragments on transmit, and pre-assign the fragment ID the device
 * will use.  Returns non-zero to make the caller fall back to the
 * normal (software) fragmentation path.
 */
1022 static inline int ip6_ufo_append_data(struct sock *sk,
1023 int getfrag(void *from, char *to, int offset, int len,
1024 int odd, struct sk_buff *skb),
1025 void *from, int length, int hh_len, int fragheaderlen,
1026 int transhdrlen, int mtu,unsigned int flags)
1029 struct sk_buff *skb;
1032 /* There is support for UDP large send offload by network
1033 * device, so create one single skb packet containing complete
/* First call for this message: allocate the head skb. */
1036 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1037 skb = sock_alloc_send_skb(sk,
1038 hh_len + fragheaderlen + transhdrlen + 20,
1039 (flags & MSG_DONTWAIT), &err);
1043 /* reserve space for Hardware header */
1044 skb_reserve(skb, hh_len);
1046 /* create space for UDP/IP header */
1047 skb_put(skb,fragheaderlen + transhdrlen);
1049 /* initialize network header pointer */
1050 skb_reset_network_header(skb);
1052 /* initialize protocol header pointer */
1053 skb->transport_header = skb->network_header + fragheaderlen;
1055 skb->ip_summed = CHECKSUM_PARTIAL;
1057 sk->sk_sndmsg_off = 0;
/* Copy the user payload into page fragments attached to the skb. */
1060 err = skb_append_datato_frags(sk,skb, getfrag, from,
1061 (length - transhdrlen));
1063 struct frag_hdr fhdr;
1065 /* specify the length of each IP datagram fragment*/
1066 skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1067 sizeof(struct frag_hdr);
1068 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1069 ipv6_select_ident(skb, &fhdr);
1070 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1071 __skb_queue_tail(&sk->sk_write_queue, skb);
1075 /* There is not enough support do UPD LSO,
1076 * so follow normal path
/*
 * Queue data onto the socket's write queue as a chain of correctly
 * sized, fragmentation-ready skbs ("corking").  The first call for a
 * message snapshots the options, route, hop limit, traffic class, and
 * MTU into the cork state; subsequent calls reuse them.  Data is
 * copied via @getfrag either into skb linear space or into attached
 * page fragments (scatter-gather devices).  The packet is actually
 * built and sent later by ip6_push_pending_frames().
 *
 * Returns 0 on success or a negative errno; on error the queued
 * length is rolled back and OUTDISCARDS is counted.
 */
1083 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1084 int offset, int len, int odd, struct sk_buff *skb),
1085 void *from, int length, int transhdrlen,
1086 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1087 struct rt6_info *rt, unsigned int flags)
1089 struct inet_sock *inet = inet_sk(sk);
1090 struct ipv6_pinfo *np = inet6_sk(sk);
1091 struct sk_buff *skb;
1092 unsigned int maxfraglen, fragheaderlen;
1099 int csummode = CHECKSUM_NONE;
1101 if (flags&MSG_PROBE)
/* ---- First append for this message: set up the cork state ---- */
1103 if (skb_queue_empty(&sk->sk_write_queue)) {
/* Take a private copy of the tx options for the cork's lifetime. */
1108 if (np->cork.opt == NULL) {
1109 np->cork.opt = kmalloc(opt->tot_len,
1111 if (unlikely(np->cork.opt == NULL))
1113 } else if (np->cork.opt->tot_len < opt->tot_len) {
1114 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
1117 memcpy(np->cork.opt, opt, opt->tot_len);
1118 inet->cork.flags |= IPCORK_OPT;
1119 /* need source address above miyazawa*/
1121 dst_hold(&rt->u.dst);
1122 inet->cork.dst = &rt->u.dst;
1123 inet->cork.fl = *fl;
1124 np->cork.hop_limit = hlimit;
1125 np->cork.tclass = tclass;
/* Snapshot the MTU: raw device MTU when probing PMTU ourselves,
 * else the path MTU; a per-socket frag_size may shrink it. */
1126 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1127 rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1128 if (np->frag_size < mtu) {
1130 mtu = np->frag_size;
1132 inet->cork.fragsize = mtu;
1133 if (dst_allfrag(rt->u.dst.path))
1134 inet->cork.flags |= IPCORK_ALLFRAG;
1135 inet->cork.length = 0;
1136 sk->sk_sndmsg_page = NULL;
1137 sk->sk_sndmsg_off = 0;
1138 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1139 rt->rt6i_nfheader_len;
1140 length += exthdrlen;
1141 transhdrlen += exthdrlen;
/* ---- Subsequent append: reuse the corked route/flow/options ---- */
1143 rt = (struct rt6_info *)inet->cork.dst;
1144 fl = &inet->cork.fl;
1145 if (inet->cork.flags & IPCORK_OPT)
1149 mtu = inet->cork.fragsize;
1152 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1154 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1155 (opt ? opt->opt_nflen : 0);
/* Largest 8-byte-aligned fragment payload boundary. */
1156 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
/* Reject messages that would exceed the maximum IPv6 payload. */
1158 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1159 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1160 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1166 * Let's try using as much space as possible.
1167 * Use MTU if total length of the message fits into the MTU.
1168 * Otherwise, we need to reserve fragment header and
1169 * fragment alignment (= 8-15 octects, in total).
1171 * Note that we may need to "move" the data from the tail of
1172 * of the buffer to the new fragment when we split
1175 * FIXME: It may be fragmented into multiple chunks
1176 * at once if non-fragmentable extension headers
1181 inet->cork.length += length;
/* Large UDP sends on UFO-capable devices use the offload path. */
1182 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1183 (rt->u.dst.dev->features & NETIF_F_UFO)) {
1185 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1186 fragheaderlen, transhdrlen, mtu,
1193 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
/* ---- Main copy loop ---- */
1196 while (length > 0) {
1197 /* Check if the remaining data fits into current packet. */
1198 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1200 copy = maxfraglen - skb->len;
1204 unsigned int datalen;
1205 unsigned int fraglen;
1206 unsigned int fraggap;
1207 unsigned int alloclen;
1208 struct sk_buff *skb_prev;
1212 /* There's no room in the current skb */
/* Bytes past the fragment boundary that must migrate from the
 * previous skb into the new one to keep alignment. */
1214 fraggap = skb_prev->len - maxfraglen;
1219 * If remaining data exceeds the mtu,
1220 * we know we need more fragment(s).
1222 datalen = length + fraggap;
1223 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1224 datalen = maxfraglen - fragheaderlen;
1226 fraglen = datalen + fragheaderlen;
1227 if ((flags & MSG_MORE) &&
1228 !(rt->u.dst.dev->features&NETIF_F_SG))
1231 alloclen = datalen + fragheaderlen;
1234 * The last fragment gets additional space at tail.
1235 * Note: we overallocate on fragments with MSG_MODE
1236 * because we have no idea if we're the last one.
1238 if (datalen == length + fraggap)
1239 alloclen += rt->u.dst.trailer_len;
1242 * We just reserve space for fragment header.
1243 * Note: this may be overallocation if the message
1244 * (without MSG_MORE) fits into the MTU.
1246 alloclen += sizeof(struct frag_hdr);
1249 skb = sock_alloc_send_skb(sk,
1251 (flags & MSG_DONTWAIT), &err);
1254 if (atomic_read(&sk->sk_wmem_alloc) <=
1256 skb = sock_wmalloc(sk,
1257 alloclen + hh_len, 1,
1259 if (unlikely(skb == NULL))
1265 * Fill in the control structures
1267 skb->ip_summed = csummode;
1269 /* reserve for fragmentation */
1270 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1273 * Find where to start putting bytes
1275 data = skb_put(skb, fraglen);
1276 skb_set_network_header(skb, exthdrlen);
1277 data += fragheaderlen;
1278 skb->transport_header = (skb->network_header +
/* Move the overhanging tail bytes (and their checksum) from the
 * previous skb into this one. */
1281 skb->csum = skb_copy_and_csum_bits(
1282 skb_prev, maxfraglen,
1283 data + transhdrlen, fraggap, 0);
1284 skb_prev->csum = csum_sub(skb_prev->csum,
1287 pskb_trim_unique(skb_prev, maxfraglen);
1289 copy = datalen - transhdrlen - fraggap;
1294 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1301 length -= datalen - fraggap;
/* Transport header and hardware-checksum hint apply only to the
 * first skb of the message. */
1304 csummode = CHECKSUM_NONE;
1307 * Put the packet on the pending queue
1309 __skb_queue_tail(&sk->sk_write_queue, skb);
/* ---- Room left in the current skb ---- */
1316 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
/* No scatter-gather: copy straight into linear skb space. */
1320 if (getfrag(from, skb_put(skb, copy),
1321 offset, copy, off, skb) < 0) {
1322 __skb_trim(skb, off);
/* Scatter-gather: append into (possibly shared) page fragments. */
1327 int i = skb_shinfo(skb)->nr_frags;
1328 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1329 struct page *page = sk->sk_sndmsg_page;
1330 int off = sk->sk_sndmsg_off;
1333 if (page && (left = PAGE_SIZE - off) > 0) {
/* Cached page differs from the last frag's page: start a
 * new frag descriptor (if a slot remains). */
1336 if (page != frag->page) {
1337 if (i == MAX_SKB_FRAGS) {
1342 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1343 frag = &skb_shinfo(skb)->frags[i];
1345 } else if(i < MAX_SKB_FRAGS) {
1346 if (copy > PAGE_SIZE)
1348 page = alloc_pages(sk->sk_allocation, 0);
1353 sk->sk_sndmsg_page = page;
1354 sk->sk_sndmsg_off = 0;
1356 skb_fill_page_desc(skb, i, page, 0, 0);
1357 frag = &skb_shinfo(skb)->frags[i];
1362 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1366 sk->sk_sndmsg_off += copy;
1369 skb->data_len += copy;
1370 skb->truesize += copy;
1371 atomic_add(copy, &sk->sk_wmem_alloc);
/* Error path: undo the length accounting and count the discard. */
1378 inet->cork.length -= length;
1379 IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * Tear down the corking state after a message has been pushed or
 * flushed: free the copied options, drop the held route reference,
 * clear the cork flags, and wipe the saved flow.
 */
1383 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1385 inet->cork.flags &= ~IPCORK_OPT;
1386 kfree(np->cork.opt);
1387 np->cork.opt = NULL;
1388 if (inet->cork.dst) {
1389 dst_release(inet->cork.dst);
1390 inet->cork.dst = NULL;
1391 inet->cork.flags &= ~IPCORK_ALLFRAG;
1393 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * Assemble the skbs queued by ip6_append_data() into one packet:
 * splice every queued skb onto the head skb's frag_list, push the
 * corked extension headers and the IPv6 header, attach the corked
 * route, update ICMPv6 counters when applicable, and transmit via
 * ip6_local_out().  Always releases the cork state before returning.
 */
1396 int ip6_push_pending_frames(struct sock *sk)
1398 struct sk_buff *skb, *tmp_skb;
1399 struct sk_buff **tail_skb;
1400 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1401 struct inet_sock *inet = inet_sk(sk);
1402 struct ipv6_pinfo *np = inet6_sk(sk);
1403 struct ipv6hdr *hdr;
1404 struct ipv6_txoptions *opt = np->cork.opt;
1405 struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1406 struct flowi *fl = &inet->cork.fl;
1407 unsigned char proto = fl->proto;
1410 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1412 tail_skb = &(skb_shinfo(skb)->frag_list);
1414 /* move skb->data to ip header from ext header */
1415 if (skb->data < skb_network_header(skb))
1416 __skb_pull(skb, skb_network_offset(skb));
/* Chain the remaining queued skbs onto the head's frag_list,
 * folding their length/truesize into the head and detaching
 * their socket ownership (the head accounts for all of them). */
1417 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1418 __skb_pull(tmp_skb, skb_network_header_len(skb));
1419 *tail_skb = tmp_skb;
1420 tail_skb = &(tmp_skb->next);
1421 skb->len += tmp_skb->len;
1422 skb->data_len += tmp_skb->len;
1423 skb->truesize += tmp_skb->truesize;
1424 __sock_put(tmp_skb->sk);
1425 tmp_skb->destructor = NULL;
1429 /* Allow local fragmentation. */
1430 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1433 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1434 __skb_pull(skb, skb_network_header_len(skb));
/* Push corked extension headers; the non-fragmentable ones may
 * rewrite final_dst (routing header). */
1435 if (opt && opt->opt_flen)
1436 ipv6_push_frag_opts(skb, opt, &proto);
1437 if (opt && opt->opt_nflen)
1438 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1440 skb_push(skb, sizeof(struct ipv6hdr));
1441 skb_reset_network_header(skb);
1442 hdr = ipv6_hdr(skb);
/* Version 6, corked traffic class, and the flow label. */
1444 *(__be32*)hdr = fl->fl6_flowlabel |
1445 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1447 hdr->hop_limit = np->cork.hop_limit;
1448 hdr->nexthdr = proto;
1449 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1450 ipv6_addr_copy(&hdr->daddr, final_dst);
1452 skb->priority = sk->sk_priority;
1453 skb->mark = sk->sk_mark;
1455 skb->dst = dst_clone(&rt->u.dst);
1456 IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1457 if (proto == IPPROTO_ICMPV6) {
1458 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1460 ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1461 ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
1464 err = ip6_local_out(skb);
/* Positive xmit codes become errors only if the socket asked
 * for them via IPV6_RECVERR. */
1467 err = np->recverr ? net_xmit_errno(err) : 0;
1473 ip6_cork_release(inet, np);
/*
 * Abort a corked message: drop every skb still on the write queue,
 * counting each as an output discard, then release the cork state.
 */
1479 void ip6_flush_pending_frames(struct sock *sk)
1481 struct sk_buff *skb;
1483 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1485 IP6_INC_STATS(ip6_dst_idev(skb->dst),
1486 IPSTATS_MIB_OUTDISCARDS);
1490 ip6_cork_release(inet_sk(sk), inet6_sk(sk));