2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
/* Forward declaration: ip6_fragment() is defined later in this file but is
 * already referenced by ip6_finish_output().
 */
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * __ip6_local_out - fill in payload_len and run the LOCAL_OUT netfilter hook
 * @skb: outgoing packet; its IPv6 header is reached through ipv6_hdr(skb)
 *
 * The payload length is skb->len minus the fixed IPv6 header and is stored
 * in the header in network byte order.  Lengths above IPV6_MAXPLEN take a
 * separate branch whose body is not part of this excerpt.
 *
 * Returns the result of nf_hook(); dst_output is handed to the hook as the
 * continuation and skb_dst(skb)->dev as the output device.
 */
61 int __ip6_local_out(struct sk_buff *skb)
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
68 ipv6_hdr(skb)->payload_len = htons(len);
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
/*
 * ip6_local_out - run __ip6_local_out() and then dst_output()
 * @skb: outgoing packet
 *
 * The lines between the two calls are not part of this excerpt, so the
 * condition under which dst_output() runs cannot be stated from here.
 * The symbol is exported to GPL modules.
 */
74 int ip6_local_out(struct sk_buff *skb)
78 err = __ip6_local_out(skb);
80 err = dst_output(skb);
84 EXPORT_SYMBOL_GPL(ip6_local_out);
/*
 * ip6_finish_output2 - final IPv6 output step before the neighbour layer
 * @skb: packet to transmit; its route is taken from skb_dst(skb)
 *
 * Marks the skb as ETH_P_IPV6.  For a multicast destination the packet may
 * be cloned and the clone passed through the NF_INET_POST_ROUTING hook; the
 * clone is made only when the output device is not IFF_LOOPBACK,
 * sk_mc_loop() is true, and either mroute6_socket() is set for a packet not
 * flagged IP6SKB_FORWARDED or ipv6_chk_mcast_addr() matches the
 * destination/source pair.  A hop_limit of 0 is counted in
 * IPSTATS_MIB_OUTDISCARDS and IPSTATS_MIB_OUTMCAST is updated.
 *
 * The packet is then handed to dst_neigh_output().  IPSTATS_MIB_OUTNOROUTES
 * is counted on a path whose condition is not visible in this excerpt.
 *
 * NOTE(review): braces and several short statements are missing from this
 * listing; the summary covers the visible lines only.
 */
86 static int ip6_finish_output2(struct sk_buff *skb)
88 struct dst_entry *dst = skb_dst(skb);
89 struct net_device *dev = dst->dev;
90 struct neighbour *neigh;
93 skb->protocol = htons(ETH_P_IPV6);
96 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
97 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
99 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
100 ((mroute6_socket(dev_net(dev), skb) &&
101 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
102 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
103 &ipv6_hdr(skb)->saddr))) {
104 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
106 /* Do not check for IFF_ALLMULTI; multicast routing
107 is not supported in any case.
110 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
111 newskb, NULL, newskb->dev,
114 if (ipv6_hdr(skb)->hop_limit == 0) {
115 IP6_INC_STATS(dev_net(dev), idev,
116 IPSTATS_MIB_OUTDISCARDS);
122 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
126 rt = (struct rt6_info *) dst;
129 return dst_neigh_output(dst, neigh, skb);
131 IP6_INC_STATS_BH(dev_net(dst->dev),
132 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
/*
 * ip6_finish_output - choose between fragmenting and direct output
 * @skb: packet to transmit
 *
 * The packet goes through ip6_fragment(), with ip6_finish_output2 as the
 * output callback, when it is longer than ip6_skb_dst_mtu() and is not
 * GSO, or when dst_allfrag() is set on its route.  Any other packet is
 * passed straight to ip6_finish_output2().
 */
137 static int ip6_finish_output(struct sk_buff *skb)
139 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
140 dst_allfrag(skb_dst(skb)))
141 return ip6_fragment(skb, ip6_finish_output2);
143 return ip6_finish_output2(skb);
/*
 * ip6_output - output entry point that applies NF_INET_POST_ROUTING
 * @skb: packet to transmit
 *
 * When the inet6_dev of the route has cnf.disable_ipv6 set,
 * IPSTATS_MIB_OUTDISCARDS is incremented; the rest of that branch is not
 * part of this excerpt.  The visible remainder hands the packet to
 * NF_HOOK_COND() for the POST_ROUTING hook with the condition
 * !(IP6CB(skb)->flags & IP6SKB_REROUTED).
 */
146 int ip6_output(struct sk_buff *skb)
148 struct net_device *dev = skb_dst(skb)->dev;
149 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
150 if (unlikely(idev->cnf.disable_ipv6)) {
151 IP6_INC_STATS(dev_net(dev), idev,
152 IPSTATS_MIB_OUTDISCARDS);
157 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
159 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
163 * xmit an sk_buff (used by TCP, SCTP and DCCP)
/*
 * ip6_xmit - build the IPv6 header in front of @skb and send it
 * @sk: sending socket; supplies np->hop_limit, sk_priority and sk_mark
 * @skb: packet payload; the route is taken from skb_dst(skb)
 * @fl6: flow; supplies the next-header protocol, flow label and addresses
 * @opt: extension-header options; their lengths size the required head room
 * @tclass: traffic class, shifted left by 20 into the first header word
 *
 * Visible steps, in order:
 *  - if the head room is smaller than the options plus the IPv6 header plus
 *    LL_RESERVED_SPACE(), reallocate it with skb_realloc_headroom(); that
 *    branch also counts IPSTATS_MIB_OUTDISCARDS and re-owns the skb with
 *    skb_set_owner_w();
 *  - push the fragmentable and non-fragmentable options;
 *  - push the IPv6 header and fill in version/class/flow label,
 *    payload_len, nexthdr, hop_limit, saddr and daddr (*first_hop);
 *  - copy sk_priority and sk_mark to the skb.
 *
 * A packet that fits the mtu, has local_df set, or is GSO is counted in
 * IPSTATS_MIB_OUT and passed to the NF_INET_LOCAL_OUT hook with dst_output
 * as the continuation.  Any other packet triggers icmpv6_send() with
 * ICMPV6_PKT_TOOBIG and is counted in IPSTATS_MIB_FRAGFAILS.
 *
 * NOTE(review): braces, short statements and the assignment of mtu are
 * missing from this excerpt; the summary covers the visible lines only.
 */
166 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
167 struct ipv6_txoptions *opt, int tclass)
169 struct net *net = sock_net(sk);
170 struct ipv6_pinfo *np = inet6_sk(sk);
171 struct in6_addr *first_hop = &fl6->daddr;
172 struct dst_entry *dst = skb_dst(skb);
174 u8 proto = fl6->flowi6_proto;
175 int seg_len = skb->len;
180 unsigned int head_room;
182 /* First: exthdrs may take lots of space (~8K for now)
183 MAX_HEADER is not enough.
185 head_room = opt->opt_nflen + opt->opt_flen;
186 seg_len += head_room;
187 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
189 if (skb_headroom(skb) < head_room) {
190 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
192 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
193 IPSTATS_MIB_OUTDISCARDS);
199 skb_set_owner_w(skb, sk);
202 ipv6_push_frag_opts(skb, opt, &proto);
204 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
207 skb_push(skb, sizeof(struct ipv6hdr));
208 skb_reset_network_header(skb);
212 * Fill in the IPv6 header
215 hlimit = np->hop_limit;
217 hlimit = ip6_dst_hoplimit(dst);
219 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
221 hdr->payload_len = htons(seg_len);
222 hdr->nexthdr = proto;
223 hdr->hop_limit = hlimit;
225 hdr->saddr = fl6->saddr;
226 hdr->daddr = *first_hop;
228 skb->priority = sk->sk_priority;
229 skb->mark = sk->sk_mark;
232 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
233 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
234 IPSTATS_MIB_OUT, skb->len);
235 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
236 dst->dev, dst_output);
239 net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
241 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
242 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
247 EXPORT_SYMBOL(ip6_xmit);
250 * To avoid extra problems ND packets are sent through this
251 * routine. It's code duplication but I really want to avoid
252 * extra checks since ipv6_build_header is used by TCP (which
253 * is for us performance critical)
/*
 * ip6_nd_hdr - write a bare IPv6 header at the start of @skb
 * @sk: socket whose np->hop_limit becomes the header hop limit
 * @skb: buffer that receives the header
 * @dev, @saddr, @daddr: declared here; their use is not visible in this
 *	excerpt
 *
 * The remainder of the parameter list is missing from this listing; the
 * body uses `proto` and `len`, which presumably come from it.
 *
 * Visible steps: set skb->protocol to ETH_P_IPV6, reset the network header
 * and reserve sizeof(struct ipv6hdr) with skb_put(), write the first word as
 * htonl(0x60000000), then set payload_len, nexthdr and hop_limit.
 */
256 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
257 const struct in6_addr *saddr, const struct in6_addr *daddr,
260 struct ipv6_pinfo *np = inet6_sk(sk);
263 skb->protocol = htons(ETH_P_IPV6);
266 skb_reset_network_header(skb);
267 skb_put(skb, sizeof(struct ipv6hdr));
270 *(__be32*)hdr = htonl(0x60000000);
272 hdr->payload_len = htons(len);
273 hdr->nexthdr = proto;
274 hdr->hop_limit = np->hop_limit;
/*
 * ip6_call_ra_chain - deliver @skb to sockets registered on ip6_ra_chain
 * @skb: received packet
 * @sel: selector value compared against ra->sel of each chain entry
 *
 * Walks ip6_ra_chain under read_lock(&ip6_ra_lock).  An entry is considered
 * when its socket is set, its sel equals @sel, and the socket is either not
 * bound to a device or bound to skb->dev.  Clones of the skb, and finally
 * the skb itself, are passed to rawv6_rcv().
 *
 * NOTE(review): the bookkeeping of `last` and the return statements are not
 * part of this excerpt, so the return contract cannot be stated from here.
 */
282 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
284 struct ip6_ra_chain *ra;
285 struct sock *last = NULL;
287 read_lock(&ip6_ra_lock);
288 for (ra = ip6_ra_chain; ra; ra = ra->next) {
289 struct sock *sk = ra->sk;
290 if (sk && ra->sel == sel &&
291 (!sk->sk_bound_dev_if ||
292 sk->sk_bound_dev_if == skb->dev->ifindex)) {
294 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
296 rawv6_rcv(last, skb2);
303 rawv6_rcv(last, skb);
304 read_unlock(&ip6_ra_lock);
307 read_unlock(&ip6_ra_lock);
/*
 * ip6_forward_proxy_check - classify a packet addressed to a proxied
 * neighbour
 * @skb: packet being forwarded
 *
 * Skips extension headers with ipv6_skip_exthdr() to find the upper-layer
 * protocol.  For ICMPv6 it makes sure the first byte of the ICMPv6 header
 * can be pulled and then switches on icmp6_type, singling out router and
 * neighbour solicitation/advertisement messages.  A link-local destination
 * leads to dst_link_failure().
 *
 * The return statements are not part of this excerpt.  The caller
 * ip6_forward() stores the result in `proxied` and, in the visible
 * `proxied < 0` branch, counts IPSTATS_MIB_INDISCARDS.
 */
311 static int ip6_forward_proxy_check(struct sk_buff *skb)
313 struct ipv6hdr *hdr = ipv6_hdr(skb);
314 u8 nexthdr = hdr->nexthdr;
318 if (ipv6_ext_hdr(nexthdr)) {
319 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
323 offset = sizeof(struct ipv6hdr);
325 if (nexthdr == IPPROTO_ICMPV6) {
326 struct icmp6hdr *icmp6;
328 if (!pskb_may_pull(skb, (skb_network_header(skb) +
329 offset + 1 - skb->data)))
332 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
334 switch (icmp6->icmp6_type) {
335 case NDISC_ROUTER_SOLICITATION:
336 case NDISC_ROUTER_ADVERTISEMENT:
337 case NDISC_NEIGHBOUR_SOLICITATION:
338 case NDISC_NEIGHBOUR_ADVERTISEMENT:
340 /* For reaction involving unicast neighbor discovery
341 * message destined to the proxied address, pass it to
351 * The proxying router can't forward traffic sent to a link-local
352 * address, so signal the sender and discard the packet. This
353 * behavior is clarified by the MIPv6 specification.
355 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
356 dst_link_failure(skb);
/*
 * ip6_forward_finish - hand a packet to dst_output()
 * @skb: packet to send
 *
 * Returns the result of dst_output().
 */
363 static inline int ip6_forward_finish(struct sk_buff *skb)
365 return dst_output(skb);
/*
 * ip6_forward - forward a received IPv6 packet along its route
 * @skb: packet to forward; route from skb_dst(skb), per-packet state from
 *	IP6CB(skb)
 *
 * Checks visible in this excerpt, in order:
 *  - devconf_all->forwarding == 0, skb_warn_if_lro(), and
 *    pkt_type != PACKET_HOST each guard a branch whose body is not shown;
 *  - a failed xfrm6_policy_check(XFRM_POLICY_FWD) counts
 *    IPSTATS_MIB_INDISCARDS;
 *  - a router-alert value is read from bytes 2-3 at offset opt->ra and
 *    offered to ip6_call_ra_chain();
 *  - hop_limit <= 1 sends ICMPV6_TIME_EXCEED / ICMPV6_EXC_HOPLIMIT and
 *    counts IPSTATS_MIB_INHDRERRORS;
 *  - with proxy_ndp enabled and a pneigh entry for the destination,
 *    ip6_forward_proxy_check() decides between ip6_input() and a discard;
 *  - a failed xfrm6_route_forward() counts IPSTATS_MIB_INDISCARDS;
 *  - when input and output device are the same, the packet is not source
 *    routed and has no sec path, a redirect is sent with
 *    ndisc_send_redirect(), rate-limited through
 *    inet_peer_xrlim_allow(peer, 1*HZ);
 *  - the source address type is tested for ANY, multicast and loopback,
 *    and a link-local source gets ICMPV6_DEST_UNREACH /
 *    ICMPV6_NOT_NEIGHBOUR;
 *  - an over-mtu packet (or frag_max_size above the mtu) gets
 *    ICMPV6_PKT_TOOBIG and counts INTOOBIGERRORS and FRAGFAILS;
 *  - a failed skb_cow() counts IPSTATS_MIB_OUTDISCARDS.
 *
 * On success OUTFORWDATAGRAMS and OUTOCTETS are counted and the packet
 * enters the NF_INET_FORWARD hook.  IPSTATS_MIB_INADDRERRORS is counted on
 * an error path whose label is not visible.
 *
 * NOTE(review): braces, gotos, returns and the hop-limit decrement are
 * missing from this listing; the summary covers the visible lines only.
 */
368 int ip6_forward(struct sk_buff *skb)
370 struct dst_entry *dst = skb_dst(skb);
371 struct ipv6hdr *hdr = ipv6_hdr(skb);
372 struct inet6_skb_parm *opt = IP6CB(skb);
373 struct net *net = dev_net(dst->dev);
376 if (net->ipv6.devconf_all->forwarding == 0)
379 if (skb_warn_if_lro(skb))
382 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
383 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
387 if (skb->pkt_type != PACKET_HOST)
390 skb_forward_csum(skb);
393 * We DO NOT make any processing on
394 * RA packets, pushing them to user level AS IS
395 * without ane WARRANTY that application will be able
396 * to interpret them. The reason is that we
397 * cannot make anything clever here.
399 * We are not end-node, so that if packet contains
400 * AH/ESP, we cannot make anything.
401 * Defragmentation also would be mistake, RA packets
402 * cannot be fragmented, because there is no warranty
403 * that different fragments will go along one path. --ANK
406 u8 *ptr = skb_network_header(skb) + opt->ra;
407 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
412 * check and decrement ttl
414 if (hdr->hop_limit <= 1) {
415 /* Force OUTPUT device used as source address */
417 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
418 IP6_INC_STATS_BH(net,
419 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
425 /* XXX: idev->cnf.proxy_ndp? */
426 if (net->ipv6.devconf_all->proxy_ndp &&
427 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
428 int proxied = ip6_forward_proxy_check(skb);
430 return ip6_input(skb);
431 else if (proxied < 0) {
432 IP6_INC_STATS(net, ip6_dst_idev(dst),
433 IPSTATS_MIB_INDISCARDS);
438 if (!xfrm6_route_forward(skb)) {
439 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
444 /* IPv6 specs say nothing about it, but it is clear that we cannot
445 send redirects to source routed frames.
446 We don't send redirects to frames decapsulated from IPsec.
448 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
449 struct in6_addr *target = NULL;
450 struct inet_peer *peer;
454 * incoming and outgoing devices are the same
458 rt = (struct rt6_info *) dst;
459 if (rt->rt6i_flags & RTF_GATEWAY)
460 target = &rt->rt6i_gateway;
462 target = &hdr->daddr;
464 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
466 /* Limit redirects both by destination (here)
467 and by source (inside ndisc_send_redirect)
469 if (inet_peer_xrlim_allow(peer, 1*HZ))
470 ndisc_send_redirect(skb, target);
474 int addrtype = ipv6_addr_type(&hdr->saddr);
476 /* This check is security critical. */
477 if (addrtype == IPV6_ADDR_ANY ||
478 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
480 if (addrtype & IPV6_ADDR_LINKLOCAL) {
481 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
482 ICMPV6_NOT_NEIGHBOUR, 0);
488 if (mtu < IPV6_MIN_MTU)
491 if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
492 (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
493 /* Again, force OUTPUT device used as source address */
495 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
496 IP6_INC_STATS_BH(net,
497 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
498 IP6_INC_STATS_BH(net,
499 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
504 if (skb_cow(skb, dst->dev->hard_header_len)) {
505 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
511 /* Mangling hops number delayed to point after skb COW */
515 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
516 IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
517 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
521 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * ip6_copy_metadata - copy per-packet metadata from one skb to another
 * @to: destination skb (a fragment being built)
 * @from: source skb
 *
 * Copies pkt_type, priority, protocol and mark, gives @to its own reference
 * on the route of @from via dst_clone(), and copies the security mark.
 * tc_index is copied only under CONFIG_NET_SCHED, and nf_trace only when
 * the xt TRACE target is configured (built in or as a module).
 */
527 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
529 to->pkt_type = from->pkt_type;
530 to->priority = from->priority;
531 to->protocol = from->protocol;
533 skb_dst_set(to, dst_clone(skb_dst(from)));
535 to->mark = from->mark;
537 #ifdef CONFIG_NET_SCHED
538 to->tc_index = from->tc_index;
541 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
542 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
543 to->nf_trace = from->nf_trace;
545 skb_copy_secmark(to, from);
/*
 * ip6_find_1stfragopt - walk the extension headers that precede the
 * fragmentable part
 * @skb: packet whose network header is inspected
 * @nexthdr: out; set to the address of the nexthdr byte of the IPv6 header
 *	and then advanced to the nexthdr byte of each extension header walked
 *
 * Starts at sizeof(struct ipv6hdr) and, while at least one more byte lies
 * within skb->tail - skb->network_header, advances by ipv6_optlen() of the
 * current extension header.  NEXTHDR_ROUTING is one of the handled cases,
 * and with CONFIG_IPV6_MIP6 a destination-options header is probed for an
 * IPV6_TLV_HAO option.  The switch statement, its other cases and the
 * return statements are not part of this excerpt; ip6_fragment() uses the
 * result as the header length `hlen`.
 *
 * NOTE(review): offset is a u16 while packet_len is unsigned int; confirm
 * that the accumulated extension-header length cannot wrap.
 */
548 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
550 u16 offset = sizeof(struct ipv6hdr);
551 struct ipv6_opt_hdr *exthdr =
552 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
553 unsigned int packet_len = skb->tail - skb->network_header;
555 *nexthdr = &ipv6_hdr(skb)->nexthdr;
557 while (offset + 1 <= packet_len) {
563 case NEXTHDR_ROUTING:
567 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
568 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
578 offset += ipv6_optlen(exthdr);
579 *nexthdr = &exthdr->nexthdr;
580 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
/*
 * ipv6_select_ident - choose the identification for a fragment header
 * @fhdr: fragment header whose identification field is written
 * @rt: route of the packet; may be NULL
 *
 * When @rt is set and not flagged DST_NOPEER, the inet_peer for the route
 * destination is looked up and the identification is taken from
 * inet_getid(peer, 0).  The other visible path advances the function-local
 * static atomic ipv6_fragmentation_id with an atomic_cmpxchg() retry loop
 * and uses the new value.  Both paths store the value with htonl().
 *
 * NOTE(review): the handling of a failed peer lookup and the computation of
 * `new` are not part of this excerpt.
 */
587 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
589 static atomic_t ipv6_fragmentation_id;
592 if (rt && !(rt->dst.flags & DST_NOPEER)) {
593 struct inet_peer *peer;
596 net = dev_net(rt->dst.dev);
597 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
599 fhdr->identification = htonl(inet_getid(peer, 0));
605 old = atomic_read(&ipv6_fragmentation_id);
609 } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
610 fhdr->identification = htonl(new);
/*
 * ip6_fragment - split @skb into IPv6 fragments
 * @skb: packet to fragment
 * @output: per-fragment output callback (its call sites are not part of
 *	this excerpt; ip6_finish_output() passes ip6_finish_output2)
 *
 * hlen is the value returned by ip6_find_1stfragopt(); a struct frag_hdr is
 * placed right after those hlen bytes in every fragment.  If the packet may
 * not be fragmented (!local_df and longer than the mtu, or frag_max_size
 * above the mtu) ICMPV6_PKT_TOOBIG is sent and IPSTATS_MIB_FRAGFAILS is
 * counted.  The usable mtu is reduced by hlen + sizeof(struct frag_hdr).
 *
 * Two strategies are visible:
 *  - if the skb carries a frag list, each piece is checked (not longer
 *    than the mtu, length a multiple of 8 unless it is the last, head room
 *    of at least hlen, not shared); the existing buffers are then reused and
 *    only the saved header copy (tmp_hdr) plus a fragment header are pushed
 *    in front of each piece;
 *  - the other path allocates a fresh skb per fragment with
 *    alloc_skb(GFP_ATOMIC), copies the header and a slice of the payload
 *    into it, and resolves CHECKSUM_PARTIAL first with skb_checksum_help().
 *
 * All fragments share one identification, chosen by ipv6_select_ident().
 * IPSTATS_MIB_FRAGCREATES, FRAGOKS and FRAGFAILS are updated on the
 * corresponding paths.
 *
 * NOTE(review): labels, loops, braces, returns and kfree calls are missing
 * from this listing; the summary covers the visible lines only.
 */
613 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
615 struct sk_buff *frag;
616 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
617 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
618 struct ipv6hdr *tmp_hdr;
620 unsigned int mtu, hlen, left, len;
623 int ptr, offset = 0, err=0;
624 u8 *prevhdr, nexthdr = 0;
625 struct net *net = dev_net(skb_dst(skb)->dev);
627 hlen = ip6_find_1stfragopt(skb, &prevhdr);
630 mtu = ip6_skb_dst_mtu(skb);
632 /* We must not fragment if the socket is set to force MTU discovery
633 * or if the skb it not generated by a local socket.
635 if (unlikely(!skb->local_df && skb->len > mtu) ||
636 (IP6CB(skb)->frag_max_size &&
637 IP6CB(skb)->frag_max_size > mtu)) {
638 if (skb->sk && dst_allfrag(skb_dst(skb)))
639 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
641 skb->dev = skb_dst(skb)->dev;
642 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
643 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
644 IPSTATS_MIB_FRAGFAILS);
649 if (np && np->frag_size < mtu) {
653 mtu -= hlen + sizeof(struct frag_hdr);
655 if (skb_has_frag_list(skb)) {
656 int first_len = skb_pagelen(skb);
657 struct sk_buff *frag2;
659 if (first_len - hlen > mtu ||
660 ((first_len - hlen) & 7) ||
664 skb_walk_frags(skb, frag) {
665 /* Correct geometry. */
666 if (frag->len > mtu ||
667 ((frag->len & 7) && frag->next) ||
668 skb_headroom(frag) < hlen)
669 goto slow_path_clean;
671 /* Partially cloned skb? */
672 if (skb_shared(frag))
673 goto slow_path_clean;
678 frag->destructor = sock_wfree;
680 skb->truesize -= frag->truesize;
685 frag = skb_shinfo(skb)->frag_list;
686 skb_frag_list_init(skb);
689 *prevhdr = NEXTHDR_FRAGMENT;
690 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
692 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
693 IPSTATS_MIB_FRAGFAILS);
697 __skb_pull(skb, hlen);
698 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
699 __skb_push(skb, hlen);
700 skb_reset_network_header(skb);
701 memcpy(skb_network_header(skb), tmp_hdr, hlen);
703 ipv6_select_ident(fh, rt);
704 fh->nexthdr = nexthdr;
706 fh->frag_off = htons(IP6_MF);
707 frag_id = fh->identification;
709 first_len = skb_pagelen(skb);
710 skb->data_len = first_len - skb_headlen(skb);
711 skb->len = first_len;
712 ipv6_hdr(skb)->payload_len = htons(first_len -
713 sizeof(struct ipv6hdr));
718 /* Prepare header of the next frame,
719 * before previous one went down. */
721 frag->ip_summed = CHECKSUM_NONE;
722 skb_reset_transport_header(frag);
723 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
724 __skb_push(frag, hlen);
725 skb_reset_network_header(frag);
726 memcpy(skb_network_header(frag), tmp_hdr,
728 offset += skb->len - hlen - sizeof(struct frag_hdr);
729 fh->nexthdr = nexthdr;
731 fh->frag_off = htons(offset);
732 if (frag->next != NULL)
733 fh->frag_off |= htons(IP6_MF);
734 fh->identification = frag_id;
735 ipv6_hdr(frag)->payload_len =
737 sizeof(struct ipv6hdr));
738 ip6_copy_metadata(frag, skb);
743 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
744 IPSTATS_MIB_FRAGCREATES);
757 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
758 IPSTATS_MIB_FRAGOKS);
759 dst_release(&rt->dst);
769 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
770 IPSTATS_MIB_FRAGFAILS);
771 dst_release(&rt->dst);
775 skb_walk_frags(skb, frag2) {
779 frag2->destructor = NULL;
780 skb->truesize += frag2->truesize;
785 if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
786 skb_checksum_help(skb))
789 left = skb->len - hlen; /* Space per frame */
790 ptr = hlen; /* Where to start from */
793 * Fragment the datagram.
796 *prevhdr = NEXTHDR_FRAGMENT;
797 hroom = LL_RESERVED_SPACE(rt->dst.dev);
798 troom = rt->dst.dev->needed_tailroom;
801 * Keep copying data until we run out.
805 /* IF: it doesn't fit, use 'mtu' - the data space left */
808 /* IF: we are not sending up to and including the packet end
809 then align the next start on an eight byte boundary */
817 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
818 hroom + troom, GFP_ATOMIC)) == NULL) {
819 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
820 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
821 IPSTATS_MIB_FRAGFAILS);
827 * Set up data on packet
830 ip6_copy_metadata(frag, skb);
831 skb_reserve(frag, hroom);
832 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
833 skb_reset_network_header(frag);
834 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
835 frag->transport_header = (frag->network_header + hlen +
836 sizeof(struct frag_hdr));
839 * Charge the memory for the fragment to any owner
843 skb_set_owner_w(frag, skb->sk);
846 * Copy the packet header into the new buffer.
848 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
851 * Build fragment header.
853 fh->nexthdr = nexthdr;
856 ipv6_select_ident(fh, rt);
857 frag_id = fh->identification;
859 fh->identification = frag_id;
862 * Copy a block of the IP datagram.
864 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
868 fh->frag_off = htons(offset);
870 fh->frag_off |= htons(IP6_MF);
871 ipv6_hdr(frag)->payload_len = htons(frag->len -
872 sizeof(struct ipv6hdr));
878 * Put this fragment into the sending queue.
884 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
885 IPSTATS_MIB_FRAGCREATES);
887 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
888 IPSTATS_MIB_FRAGOKS);
893 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
894 IPSTATS_MIB_FRAGFAILS);
/*
 * ip6_rt_check - test whether a cached route key fails to cover an address
 * @rt_key: destination or source key of the cached route
 * @fl_addr: address from the flow being sent
 * @addr_cache: last address used with the cached route, or NULL
 *
 * Returns non-zero (the cached route is not confirmed for @fl_addr) when
 * both of the following hold:
 *  - the key is not a /128 entry equal to @fl_addr, and
 *  - @addr_cache is NULL or differs from @fl_addr.
 * Returns 0 as soon as either the host key or the cached address matches.
 */
899 static inline int ip6_rt_check(const struct rt6key *rt_key,
900 const struct in6_addr *fl_addr,
901 const struct in6_addr *addr_cache)
903 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
904 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
/*
 * ip6_sk_dst_check - validate a socket's cached route against a flow
 * @sk: socket owning the cache; supplies daddr_cache and saddr_cache
 * @dst: cached route to validate
 * @fl6: flow about to be sent
 *
 * The cached route is treated as a mismatch when ip6_rt_check() fails for
 * the destination key, or (with CONFIG_IPV6_SUBTREES) for the source key,
 * or when the flow names an output interface other than dst->dev.
 *
 * NOTE(review): the action taken on a mismatch, any NULL check on @dst and
 * the return statements are not part of this excerpt.
 */
907 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
908 struct dst_entry *dst,
909 const struct flowi6 *fl6)
911 struct ipv6_pinfo *np = inet6_sk(sk);
912 struct rt6_info *rt = (struct rt6_info *)dst;
917 /* Yes, checking route validity in not connected
918 * case is not very simple. Take into account,
919 * that we do not support routing by source, TOS,
920 * and MSG_DONTROUTE --ANK (980726)
922 * 1. ip6_rt_check(): If route was host route,
923 * check that cached destination is current.
924 * If it is network route, we still may
925 * check its validity using saved pointer
926 * to the last used address: daddr_cache.
927 * We do not want to save whole address now,
928 * (because main consumer of this service
929 * is tcp, which has not this problem),
930 * so that the last trick works only on connected
932 * 2. oif also should be the same.
934 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
935 #ifdef CONFIG_IPV6_SUBTREES
936 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
938 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
/*
 * ip6_dst_lookup_tail - common route lookup used by the ip6_dst_lookup*()
 * entry points
 * @sk: socket providing the namespace and, when set, source preferences
 * @dst: in/out pointer to the route
 * @fl6: flow to look up; its saddr may be filled in here
 *
 * Visible steps:
 *  - obtain a route with ip6_route_output() and bail out through
 *    out_err_release if the route carries an error;
 *  - if the flow has no source address, choose one with
 *    ip6_route_get_saddr();
 *  - with CONFIG_IPV6_OPTIMISTIC_DAD: when the neighbour entry is not
 *    NUD_VALID and the source address is flagged IFA_F_OPTIMISTIC, redo the
 *    lookup on a copy of the flow with daddr zeroed, which the original
 *    comments describe as the default router's dst entry;
 *  - an -ENETUNREACH result is counted in IPSTATS_MIB_OUTNOROUTES.
 *
 * NOTE(review): labels, releases, the neighbour lookup and the return
 * statements are missing from this listing.
 */
947 static int ip6_dst_lookup_tail(struct sock *sk,
948 struct dst_entry **dst, struct flowi6 *fl6)
950 struct net *net = sock_net(sk);
951 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
958 *dst = ip6_route_output(net, sk, fl6);
960 if ((err = (*dst)->error))
961 goto out_err_release;
963 if (ipv6_addr_any(&fl6->saddr)) {
964 struct rt6_info *rt = (struct rt6_info *) *dst;
965 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
966 sk ? inet6_sk(sk)->srcprefs : 0,
969 goto out_err_release;
972 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
974 * Here if the dst entry we've looked up
975 * has a neighbour entry that is in the INCOMPLETE
976 * state and the src address from the flow is
977 * marked as OPTIMISTIC, we release the found
978 * dst entry and replace it instead with the
979 * dst entry of the nexthop router
981 rt = (struct rt6_info *) *dst;
983 if (n && !(n->nud_state & NUD_VALID)) {
984 struct inet6_ifaddr *ifp;
985 struct flowi6 fl_gw6;
988 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
991 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
997 * We need to get the dst entry for the
998 * default router instead
1001 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1002 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1003 *dst = ip6_route_output(net, sk, &fl_gw6);
1004 if ((err = (*dst)->error))
1005 goto out_err_release;
1013 if (err == -ENETUNREACH)
1014 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1021 * ip6_dst_lookup - perform route lookup on flow
1022 * @sk: socket which provides route info
1023 * @dst: pointer to dst_entry * for result
1024 * @fl6: flow to lookup
1026 * This function performs a route lookup on the given flow.
1028 * It returns zero on success, or a standard errno code on error.
/*
 * The visible body forwards @sk, @dst and @fl6 to ip6_dst_lookup_tail() and
 * returns its result.  One line between the signature and the return is
 * missing from this excerpt.  The symbol is exported to GPL modules.
 */
1030 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1033 return ip6_dst_lookup_tail(sk, dst, fl6);
1035 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1038 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1039 * @sk: socket which provides route info
1040 * @fl6: flow to lookup
1041 * @final_dst: final destination address for ipsec lookup
1042 * @can_sleep: we are in a sleepable context
1044 * This function performs a route lookup on the given flow.
1046 * It returns a valid dst pointer on success, or a pointer encoded
/*
 * Visible steps: run ip6_dst_lookup_tail() starting from a NULL dst and
 * return ERR_PTR(err) on its error path; overwrite fl6->daddr with
 * *final_dst; set FLOWI_FLAG_CAN_SLEEP in the flow flags; pass the route
 * through xfrm_lookup() and return the result.
 *
 * NOTE(review): the conditions guarding the daddr overwrite and the
 * CAN_SLEEP flag, and the last parameter line, are missing from this
 * listing; presumably they test final_dst and can_sleep.
 */
1049 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1050 const struct in6_addr *final_dst,
1053 struct dst_entry *dst = NULL;
1056 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1058 return ERR_PTR(err);
1060 fl6->daddr = *final_dst;
1062 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1064 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1066 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1069 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1070 * @sk: socket which provides the dst cache and route info
1071 * @fl6: flow to lookup
1072 * @final_dst: final destination address for ipsec lookup
1073 * @can_sleep: we are in a sleepable context
1075 * This function performs a route lookup on the given flow with the
1076 * possibility of using the cached route in the socket if it is valid.
1077 * It will take the socket dst lock when operating on the dst cache.
1078 * As a result, this function can only be used in process context.
1080 * It returns a valid dst pointer on success, or a pointer encoded
/*
 * Visible steps: fetch the socket's cached route with sk_dst_check() using
 * the stored dst_cookie, validate it against the flow with
 * ip6_sk_dst_check(), then run ip6_dst_lookup_tail() on the result and
 * return ERR_PTR(err) on its error path.  After that fl6->daddr is
 * overwritten with *final_dst, FLOWI_FLAG_CAN_SLEEP is set, and the route
 * goes through xfrm_lookup().
 *
 * NOTE(review): the conditions guarding the daddr overwrite and the
 * CAN_SLEEP flag, and the last parameter line, are missing from this
 * listing; presumably they test final_dst and can_sleep.
 */
1083 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1084 const struct in6_addr *final_dst,
1087 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1090 dst = ip6_sk_dst_check(sk, dst, fl6);
1092 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1094 return ERR_PTR(err);
1096 fl6->daddr = *final_dst;
1098 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1100 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1102 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
/*
 * ip6_ufo_append_data - append data to one large skb for UDP fragmentation
 * offload
 * @sk: socket whose sk_write_queue is used
 * @getfrag: callback that copies user data
 * @from: opaque source handed to @getfrag
 * @length: bytes to append, including @transhdrlen
 * @hh_len: link-layer head room to reserve
 * @fragheaderlen: length of the IPv6 header plus non-fragmentable headers
 * @transhdrlen: transport header length
 * @mtu: path mtu, used to derive gso_size
 * @flags: message flags; MSG_DONTWAIT is forwarded to the allocator
 * @rt: route, used to choose the fragment identification
 *
 * If the write queue is empty a new skb is allocated with room for
 * hh_len + fragheaderlen + transhdrlen + 20 bytes, the head room is
 * reserved, the header area is added with skb_put(), network and transport
 * header offsets are set and ip_summed becomes CHECKSUM_PARTIAL.  The
 * payload (length - transhdrlen bytes) is attached with
 * skb_append_datato_frags().  The skb is then set up for GSO: gso_size is
 * (mtu - fragheaderlen - sizeof(struct frag_hdr)) rounded down to a
 * multiple of 8, gso_type is SKB_GSO_UDP, and ip6_frag_id comes from
 * ipv6_select_ident(); finally the skb is queued on sk_write_queue.
 *
 * NOTE(review): error checks, braces and return statements are missing from
 * this listing.
 */
1104 static inline int ip6_ufo_append_data(struct sock *sk,
1105 int getfrag(void *from, char *to, int offset, int len,
1106 int odd, struct sk_buff *skb),
1107 void *from, int length, int hh_len, int fragheaderlen,
1108 int transhdrlen, int mtu,unsigned int flags,
1109 struct rt6_info *rt)
1112 struct sk_buff *skb;
1115 /* There is support for UDP large send offload by network
1116 * device, so create one single skb packet containing complete
1119 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1120 skb = sock_alloc_send_skb(sk,
1121 hh_len + fragheaderlen + transhdrlen + 20,
1122 (flags & MSG_DONTWAIT), &err);
1126 /* reserve space for Hardware header */
1127 skb_reserve(skb, hh_len);
1129 /* create space for UDP/IP header */
1130 skb_put(skb,fragheaderlen + transhdrlen);
1132 /* initialize network header pointer */
1133 skb_reset_network_header(skb);
1135 /* initialize protocol header pointer */
1136 skb->transport_header = skb->network_header + fragheaderlen;
1138 skb->ip_summed = CHECKSUM_PARTIAL;
1142 err = skb_append_datato_frags(sk,skb, getfrag, from,
1143 (length - transhdrlen));
1145 struct frag_hdr fhdr;
1147 /* Specify the length of each IPv6 datagram fragment.
1148 * It has to be a multiple of 8.
1150 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1151 sizeof(struct frag_hdr)) & ~7;
1152 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1153 ipv6_select_ident(&fhdr, rt);
1154 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1155 __skb_queue_tail(&sk->sk_write_queue, skb);
1159 /* There is not enough support do UPD LSO,
1160 * so follow normal path
/*
 * ip6_opt_dup - duplicate an extension (options) header
 * @src: header to copy, or NULL
 *
 * Returns NULL when @src is NULL, otherwise a kmemdup() copy of
 * (src->hdrlen + 1) * 8 bytes allocated with `gfp`, which may itself be
 * NULL on allocation failure.  The parameter line that declares `gfp` is
 * missing from this excerpt.
 */
1167 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1170 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * ip6_rthdr_dup - duplicate a routing header
 * @src: header to copy, or NULL
 *
 * Returns NULL when @src is NULL, otherwise a kmemdup() copy of
 * (src->hdrlen + 1) * 8 bytes allocated with `gfp`, which may itself be
 * NULL on allocation failure.  The parameter line that declares `gfp` is
 * missing from this excerpt.
 */
1173 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1176 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * ip6_append_data_mtu - recompute mtu and maxfraglen for the next fragment
 * @mtu: in/out mtu
 * @fragheaderlen: length of the headers repeated in every fragment
 * @skb: current tail skb; ip6_append_data() calls this when @skb or its
 *	predecessor is NULL
 * @rt: route
 *
 * The `maxfraglen` out-parameter is written in the body, but its
 * declaration line is missing from this excerpt.
 *
 * Only routes without DST_XFRM_TUNNEL are adjusted.  For the first
 * fragment rt->dst.header_len is subtracted from *mtu; for later fragments
 * *mtu is reset to dst_mtu(rt->dst.path), since, as the original comment
 * says, the header space then counts as data space.  *maxfraglen becomes
 * the largest fragment size whose payload is a multiple of 8 bytes, less
 * sizeof(struct frag_hdr).
 */
1179 static void ip6_append_data_mtu(int *mtu,
1181 unsigned int fragheaderlen,
1182 struct sk_buff *skb,
1183 struct rt6_info *rt)
1185 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1187 /* first fragment, reserve header_len */
1188 *mtu = *mtu - rt->dst.header_len;
1192 * this fragment is not first, the headers
1193 * space is regarded as data space.
1195 *mtu = dst_mtu(rt->dst.path);
1197 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1198 + fragheaderlen - sizeof(struct frag_hdr);
/*
 * ip6_append_data - append user data to the socket's pending IPv6 packets
 * @sk: socket; packets are queued on sk->sk_write_queue
 * @getfrag: callback that copies (and may checksum) data from @from
 * @from: opaque source handed to @getfrag
 * @length: number of bytes to append
 * @transhdrlen: transport header length
 * @hlimit: hop limit saved in the cork
 * @tclass: traffic class saved in the cork
 * @opt: extension-header options, duplicated into the cork
 * @fl6: flow, copied into the cork
 * @rt: route, saved as the cork dst
 * @flags: message flags (MSG_PROBE, MSG_MORE, MSG_DONTWAIT are tested)
 * @dontfrag: when set for UDP or RAW, ipv6_local_rxpmtu() is reported in a
 *	branch whose condition is not fully visible
 *
 * Outline of the visible code:
 *  - first call on an empty write queue: duplicate @opt into np->cork.opt
 *    (each sub-header via ip6_opt_dup() / ip6_rthdr_dup()), store route,
 *    flow, hop limit and traffic class, and derive the mtu from the device
 *    mtu (IPV6_PMTUDISC_PROBE) or dst_mtu(), capped by np->frag_size; the
 *    mtu is remembered in cork->fragsize and IPCORK_ALLFRAG is set when
 *    dst_allfrag() says so;
 *  - later calls reuse the route, flow and mtu held in the cork;
 *  - fragheaderlen and maxfraglen are computed, and a total length beyond
 *    what one IPv6 packet can carry is reported with
 *    ipv6_local_error(EMSGSIZE);
 *  - for SOCK_DGRAM the tx timestamp flags are collected;
 *  - UDP on a NETIF_F_UFO device is delegated to ip6_ufo_append_data();
 *  - otherwise a loop runs while length > 0: when the tail skb has no room
 *    a new skb is allocated, any bytes beyond maxfraglen in the previous
 *    skb (fraggap) are moved over with their checksum, and data is copied
 *    in through @getfrag, either linearly or, on NETIF_F_SG devices, into
 *    page fragments from sk_page_frag();
 *  - on error cork->length is rolled back by the unsent length and
 *    IPSTATS_MIB_OUTDISCARDS is counted.
 *
 * NOTE(review): np->cork.opt is allocated with kmalloc(opt->tot_len) and
 * only tot_len, opt_flen, opt_nflen and the four header pointers are
 * visibly initialised here; confirm no other field is read before being
 * set.
 * NOTE(review): braces, labels, returns and many short statements are
 * missing from this listing; the summary covers the visible lines only.
 */
1202 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1203 int offset, int len, int odd, struct sk_buff *skb),
1204 void *from, int length, int transhdrlen,
1205 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1206 struct rt6_info *rt, unsigned int flags, int dontfrag)
1208 struct inet_sock *inet = inet_sk(sk);
1209 struct ipv6_pinfo *np = inet6_sk(sk);
1210 struct inet_cork *cork;
1211 struct sk_buff *skb, *skb_prev = NULL;
1212 unsigned int maxfraglen, fragheaderlen;
1222 if (flags&MSG_PROBE)
1224 cork = &inet->cork.base;
1225 if (skb_queue_empty(&sk->sk_write_queue)) {
1230 if (WARN_ON(np->cork.opt))
1233 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1234 if (unlikely(np->cork.opt == NULL))
1237 np->cork.opt->tot_len = opt->tot_len;
1238 np->cork.opt->opt_flen = opt->opt_flen;
1239 np->cork.opt->opt_nflen = opt->opt_nflen;
1241 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1243 if (opt->dst0opt && !np->cork.opt->dst0opt)
1246 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1248 if (opt->dst1opt && !np->cork.opt->dst1opt)
1251 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1253 if (opt->hopopt && !np->cork.opt->hopopt)
1256 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1258 if (opt->srcrt && !np->cork.opt->srcrt)
1261 /* need source address above miyazawa*/
1264 cork->dst = &rt->dst;
1265 inet->cork.fl.u.ip6 = *fl6;
1266 np->cork.hop_limit = hlimit;
1267 np->cork.tclass = tclass;
1268 if (rt->dst.flags & DST_XFRM_TUNNEL)
1269 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1270 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1272 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1273 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1274 if (np->frag_size < mtu) {
1276 mtu = np->frag_size;
1278 cork->fragsize = mtu;
1279 if (dst_allfrag(rt->dst.path))
1280 cork->flags |= IPCORK_ALLFRAG;
1282 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1283 length += exthdrlen;
1284 transhdrlen += exthdrlen;
1285 dst_exthdrlen = rt->dst.header_len;
1287 rt = (struct rt6_info *)cork->dst;
1288 fl6 = &inet->cork.fl.u.ip6;
1293 mtu = cork->fragsize;
1296 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1298 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1299 (opt ? opt->opt_nflen : 0);
1300 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1302 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1303 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1304 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1309 /* For UDP, check if TX timestamp is enabled */
1310 if (sk->sk_type == SOCK_DGRAM) {
1311 err = sock_tx_timestamp(sk, &tx_flags);
1317 * Let's try using as much space as possible.
1318 * Use MTU if total length of the message fits into the MTU.
1319 * Otherwise, we need to reserve fragment header and
1320 * fragment alignment (= 8-15 octects, in total).
1322 * Note that we may need to "move" the data from the tail of
1323 * of the buffer to the new fragment when we split
1326 * FIXME: It may be fragmented into multiple chunks
1327 * at once if non-fragmentable extension headers
1332 cork->length += length;
1334 int proto = sk->sk_protocol;
1335 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1336 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1340 if (proto == IPPROTO_UDP &&
1341 (rt->dst.dev->features & NETIF_F_UFO)) {
1343 err = ip6_ufo_append_data(sk, getfrag, from, length,
1344 hh_len, fragheaderlen,
1345 transhdrlen, mtu, flags, rt);
1352 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1355 while (length > 0) {
1356 /* Check if the remaining data fits into current packet. */
1357 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1359 copy = maxfraglen - skb->len;
1363 unsigned int datalen;
1364 unsigned int fraglen;
1365 unsigned int fraggap;
1366 unsigned int alloclen;
1368 /* There's no room in the current skb */
1370 fraggap = skb->len - maxfraglen;
1373 /* update mtu and maxfraglen if necessary */
1374 if (skb == NULL || skb_prev == NULL)
1375 ip6_append_data_mtu(&mtu, &maxfraglen,
1376 fragheaderlen, skb, rt);
1381 * If remaining data exceeds the mtu,
1382 * we know we need more fragment(s).
1384 datalen = length + fraggap;
1386 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1387 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1388 if ((flags & MSG_MORE) &&
1389 !(rt->dst.dev->features&NETIF_F_SG))
1392 alloclen = datalen + fragheaderlen;
1394 alloclen += dst_exthdrlen;
1396 if (datalen != length + fraggap) {
1398 * this is not the last fragment, the trailer
1399 * space is regarded as data space.
1401 datalen += rt->dst.trailer_len;
1404 alloclen += rt->dst.trailer_len;
1405 fraglen = datalen + fragheaderlen;
1408 * We just reserve space for fragment header.
1409 * Note: this may be overallocation if the message
1410 * (without MSG_MORE) fits into the MTU.
1412 alloclen += sizeof(struct frag_hdr);
1415 skb = sock_alloc_send_skb(sk,
1417 (flags & MSG_DONTWAIT), &err);
1420 if (atomic_read(&sk->sk_wmem_alloc) <=
1422 skb = sock_wmalloc(sk,
1423 alloclen + hh_len, 1,
1425 if (unlikely(skb == NULL))
1428 /* Only the initial fragment
1437 * Fill in the control structures
1439 skb->ip_summed = CHECKSUM_NONE;
1441 /* reserve for fragmentation and ipsec header */
1442 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1445 if (sk->sk_type == SOCK_DGRAM)
1446 skb_shinfo(skb)->tx_flags = tx_flags;
1449 * Find where to start putting bytes
1451 data = skb_put(skb, fraglen);
1452 skb_set_network_header(skb, exthdrlen);
1453 data += fragheaderlen;
1454 skb->transport_header = (skb->network_header +
1457 skb->csum = skb_copy_and_csum_bits(
1458 skb_prev, maxfraglen,
1459 data + transhdrlen, fraggap, 0);
1460 skb_prev->csum = csum_sub(skb_prev->csum,
1463 pskb_trim_unique(skb_prev, maxfraglen);
1465 copy = datalen - transhdrlen - fraggap;
1471 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1478 length -= datalen - fraggap;
1484 * Put the packet on the pending queue
1486 __skb_queue_tail(&sk->sk_write_queue, skb);
1493 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1497 if (getfrag(from, skb_put(skb, copy),
1498 offset, copy, off, skb) < 0) {
1499 __skb_trim(skb, off);
1504 int i = skb_shinfo(skb)->nr_frags;
1505 struct page_frag *pfrag = sk_page_frag(sk);
1508 if (!sk_page_frag_refill(sk, pfrag))
1511 if (!skb_can_coalesce(skb, i, pfrag->page,
1514 if (i == MAX_SKB_FRAGS)
1517 __skb_fill_page_desc(skb, i, pfrag->page,
1519 skb_shinfo(skb)->nr_frags = ++i;
1520 get_page(pfrag->page);
1522 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1524 page_address(pfrag->page) + pfrag->offset,
1525 offset, copy, skb->len, skb) < 0)
1528 pfrag->offset += copy;
1529 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1531 skb->data_len += copy;
1532 skb->truesize += copy;
1533 atomic_add(copy, &sk->sk_wmem_alloc);
1544 cork->length -= length;
1545 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1548 EXPORT_SYMBOL_GPL(ip6_append_data);
1550 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1553 kfree(np->cork.opt->dst0opt);
1554 kfree(np->cork.opt->dst1opt);
1555 kfree(np->cork.opt->hopopt);
1556 kfree(np->cork.opt->srcrt);
1557 kfree(np->cork.opt);
1558 np->cork.opt = NULL;
1561 if (inet->cork.base.dst) {
1562 dst_release(inet->cork.base.dst);
1563 inet->cork.base.dst = NULL;
1564 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1566 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * ip6_push_pending_frames - build one packet from the socket's pending
 * queue and send it.
 *
 * Takes every skb queued on sk->sk_write_queue, chains them onto the
 * frag_list of the first one, prepends the corked extension headers and
 * an IPv6 header built from the corked flow, and passes the result to
 * ip6_local_out().  The cork state is released via ip6_cork_release().
 *
 * NOTE(review): the embedded line numbers in this listing skip 1570,
 * 1582-1583, 1585, 1599-1601, 1604-1605, 1635-1636, 1638-1639,
 * 1641-1645, 1647-1648 and 1650-1651.  The declaration of 'err', the
 * bodies of two 'if' statements, the error-path tests, labels and the
 * return statement are therefore not visible here; confirm the control
 * flow against the complete file.
 */
1569 int ip6_push_pending_frames(struct sock *sk)
1571 struct sk_buff *skb, *tmp_skb;
1572 struct sk_buff **tail_skb;
1573 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1574 struct inet_sock *inet = inet_sk(sk);
1575 struct ipv6_pinfo *np = inet6_sk(sk);
1576 struct net *net = sock_net(sk);
1577 struct ipv6hdr *hdr;
1578 struct ipv6_txoptions *opt = np->cork.opt;
1579 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1580 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1581 unsigned char proto = fl6->flowi6_proto;
/*
 * The first queued skb becomes the head of the outgoing packet.
 * NOTE(review): the statement executed when the queue is empty is not
 * visible in this listing.
 */
1584 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
/* tail_skb always points at the link where the next skb is attached. */
1586 tail_skb = &(skb_shinfo(skb)->frag_list);
1588 /* move skb->data to ip header from ext header */
1589 if (skb->data < skb_network_header(skb))
1590 __skb_pull(skb, skb_network_offset(skb));
/*
 * Append each remaining skb to the head's chain.  Each one is pulled by
 * the head's network header length, and its len and truesize are added
 * to the head's len/data_len/truesize.
 */
1591 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1592 __skb_pull(tmp_skb, skb_network_header_len(skb));
1593 *tail_skb = tmp_skb;
1594 tail_skb = &(tmp_skb->next);
1595 skb->len += tmp_skb->len;
1596 skb->data_len += tmp_skb->len;
1597 skb->truesize += tmp_skb->truesize;
/* presumably safe because truesize was moved to the head skb - confirm */
1598 tmp_skb->destructor = NULL;
1602 /* Allow local fragmentation. */
/* NOTE(review): the statement controlled by this test is not visible. */
1603 if (np->pmtudisc < IPV6_PMTUDISC_DO)
/*
 * Start from the flow's destination.  ipv6_push_nfrag_opts() is handed
 * &final_dst, so it can presumably redirect it - confirm.
 */
1606 *final_dst = fl6->daddr;
1607 __skb_pull(skb, skb_network_header_len(skb));
/* Options are only pushed when the corresponding option length is set. */
1608 if (opt && opt->opt_flen)
1609 ipv6_push_frag_opts(skb, opt, &proto);
1610 if (opt && opt->opt_nflen)
1611 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
/* Make room for the fixed IPv6 header and point the network header at it. */
1613 skb_push(skb, sizeof(struct ipv6hdr));
1614 skb_reset_network_header(skb);
1615 hdr = ipv6_hdr(skb);
/*
 * First 32-bit word of the header: 0x6 in the top nibble (IP version),
 * the corked traffic class shifted left by 20, OR-ed with the flow label.
 */
1617 *(__be32*)hdr = fl6->flowlabel |
1618 htonl(0x60000000 | ((int)np->cork.tclass << 20));
/* 'proto' may have been updated by the option-push helpers above. */
1620 hdr->hop_limit = np->cork.hop_limit;
1621 hdr->nexthdr = proto;
1622 hdr->saddr = fl6->saddr;
1623 hdr->daddr = *final_dst;
1625 skb->priority = sk->sk_priority;
1626 skb->mark = sk->sk_mark;
/* Attach the corked route to the packet via dst_clone(). */
1628 skb_dst_set(skb, dst_clone(&rt->dst));
1629 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
/* ICMPv6 packets additionally bump the per-type and OUTMSGS counters. */
1630 if (proto == IPPROTO_ICMPV6) {
1631 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1633 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1634 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1637 err = ip6_local_out(skb);
/* NOTE(review): the conditions guarding this conversion are not visible. */
1640 err = net_xmit_errno(err);
/* Release the corked options, route and flow. */
1646 ip6_cork_release(inet, np);
/* NOTE(review): presumably the error path only - the label is not visible. */
1649 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1652 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1654 void ip6_flush_pending_frames(struct sock *sk)
1656 struct sk_buff *skb;
1658 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1660 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1661 IPSTATS_MIB_OUTDISCARDS);
1665 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1667 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);