2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
61 int __ip6_local_out(struct sk_buff *skb)
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
68 ipv6_hdr(skb)->payload_len = htons(len);
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
/*
 * Send a locally generated IPv6 packet: run the LOCAL_OUT hook and, unless
 * netfilter stole or dropped the skb (verdict != 1), hand it to dst_output().
 */
int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))	/* 1 == NF_ACCEPT passthrough */
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
86 static int ip6_finish_output2(struct sk_buff *skb)
88 struct dst_entry *dst = skb_dst(skb);
89 struct net_device *dev = dst->dev;
90 struct neighbour *neigh;
93 skb->protocol = htons(ETH_P_IPV6);
96 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
97 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
99 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
100 ((mroute6_socket(dev_net(dev), skb) &&
101 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
102 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
103 &ipv6_hdr(skb)->saddr))) {
104 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
106 /* Do not check for IFF_ALLMULTI; multicast routing
107 is not supported in any case.
110 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
111 newskb, NULL, newskb->dev,
114 if (ipv6_hdr(skb)->hop_limit == 0) {
115 IP6_INC_STATS(dev_net(dev), idev,
116 IPSTATS_MIB_OUTDISCARDS);
122 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
126 rt = (struct rt6_info *) dst;
129 return dst_neigh_output(dst, neigh, skb);
131 IP6_INC_STATS_BH(dev_net(dst->dev),
132 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
137 static int ip6_finish_output(struct sk_buff *skb)
139 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
140 dst_allfrag(skb_dst(skb)))
141 return ip6_fragment(skb, ip6_finish_output2);
143 return ip6_finish_output2(skb);
146 int ip6_output(struct sk_buff *skb)
148 struct net_device *dev = skb_dst(skb)->dev;
149 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
150 if (unlikely(idev->cnf.disable_ipv6)) {
151 IP6_INC_STATS(dev_net(dev), idev,
152 IPSTATS_MIB_OUTDISCARDS);
157 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
159 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
163 * xmit an sk_buff (used by TCP, SCTP and DCCP)
166 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
167 struct ipv6_txoptions *opt, int tclass)
169 struct net *net = sock_net(sk);
170 struct ipv6_pinfo *np = inet6_sk(sk);
171 struct in6_addr *first_hop = &fl6->daddr;
172 struct dst_entry *dst = skb_dst(skb);
174 u8 proto = fl6->flowi6_proto;
175 int seg_len = skb->len;
180 unsigned int head_room;
182 /* First: exthdrs may take lots of space (~8K for now)
183 MAX_HEADER is not enough.
185 head_room = opt->opt_nflen + opt->opt_flen;
186 seg_len += head_room;
187 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
189 if (skb_headroom(skb) < head_room) {
190 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
192 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
193 IPSTATS_MIB_OUTDISCARDS);
199 skb_set_owner_w(skb, sk);
202 ipv6_push_frag_opts(skb, opt, &proto);
204 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
207 skb_push(skb, sizeof(struct ipv6hdr));
208 skb_reset_network_header(skb);
212 * Fill in the IPv6 header
215 hlimit = np->hop_limit;
217 hlimit = ip6_dst_hoplimit(dst);
219 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
221 hdr->payload_len = htons(seg_len);
222 hdr->nexthdr = proto;
223 hdr->hop_limit = hlimit;
225 hdr->saddr = fl6->saddr;
226 hdr->daddr = *first_hop;
228 skb->priority = sk->sk_priority;
229 skb->mark = sk->sk_mark;
232 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
233 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
234 IPSTATS_MIB_OUT, skb->len);
235 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
236 dst->dev, dst_output);
239 net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
241 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
242 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
247 EXPORT_SYMBOL(ip6_xmit);
250 * To avoid extra problems ND packets are send through this
251 * routine. It's code duplication but I really want to avoid
252 * extra checks since ipv6_build_header is used by TCP (which
253 * is for us performance critical)
256 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
257 const struct in6_addr *saddr, const struct in6_addr *daddr,
260 struct ipv6_pinfo *np = inet6_sk(sk);
263 skb->protocol = htons(ETH_P_IPV6);
266 skb_reset_network_header(skb);
267 skb_put(skb, sizeof(struct ipv6hdr));
270 *(__be32*)hdr = htonl(0x60000000);
272 hdr->payload_len = htons(len);
273 hdr->nexthdr = proto;
274 hdr->hop_limit = np->hop_limit;
282 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
284 struct ip6_ra_chain *ra;
285 struct sock *last = NULL;
287 read_lock(&ip6_ra_lock);
288 for (ra = ip6_ra_chain; ra; ra = ra->next) {
289 struct sock *sk = ra->sk;
290 if (sk && ra->sel == sel &&
291 (!sk->sk_bound_dev_if ||
292 sk->sk_bound_dev_if == skb->dev->ifindex)) {
294 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
296 rawv6_rcv(last, skb2);
303 rawv6_rcv(last, skb);
304 read_unlock(&ip6_ra_lock);
307 read_unlock(&ip6_ra_lock);
311 static int ip6_forward_proxy_check(struct sk_buff *skb)
313 struct ipv6hdr *hdr = ipv6_hdr(skb);
314 u8 nexthdr = hdr->nexthdr;
318 if (ipv6_ext_hdr(nexthdr)) {
319 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
323 offset = sizeof(struct ipv6hdr);
325 if (nexthdr == IPPROTO_ICMPV6) {
326 struct icmp6hdr *icmp6;
328 if (!pskb_may_pull(skb, (skb_network_header(skb) +
329 offset + 1 - skb->data)))
332 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
334 switch (icmp6->icmp6_type) {
335 case NDISC_ROUTER_SOLICITATION:
336 case NDISC_ROUTER_ADVERTISEMENT:
337 case NDISC_NEIGHBOUR_SOLICITATION:
338 case NDISC_NEIGHBOUR_ADVERTISEMENT:
340 /* For reaction involving unicast neighbor discovery
341 * message destined to the proxied address, pass it to
351 * The proxying router can't forward traffic sent to a link-local
352 * address, so signal the sender and discard the packet. This
353 * behavior is clarified by the MIPv6 specification.
355 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
356 dst_link_failure(skb);
/* Final step of forwarding: hand the packet to the route's output. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
368 int ip6_forward(struct sk_buff *skb)
370 struct dst_entry *dst = skb_dst(skb);
371 struct ipv6hdr *hdr = ipv6_hdr(skb);
372 struct inet6_skb_parm *opt = IP6CB(skb);
373 struct net *net = dev_net(dst->dev);
376 if (net->ipv6.devconf_all->forwarding == 0)
379 if (skb_warn_if_lro(skb))
382 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
383 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
387 if (skb->pkt_type != PACKET_HOST)
390 skb_forward_csum(skb);
393 * We DO NOT make any processing on
394 * RA packets, pushing them to user level AS IS
395 * without ane WARRANTY that application will be able
396 * to interpret them. The reason is that we
397 * cannot make anything clever here.
399 * We are not end-node, so that if packet contains
400 * AH/ESP, we cannot make anything.
401 * Defragmentation also would be mistake, RA packets
402 * cannot be fragmented, because there is no warranty
403 * that different fragments will go along one path. --ANK
406 u8 *ptr = skb_network_header(skb) + opt->ra;
407 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
412 * check and decrement ttl
414 if (hdr->hop_limit <= 1) {
415 /* Force OUTPUT device used as source address */
417 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
418 IP6_INC_STATS_BH(net,
419 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
425 /* XXX: idev->cnf.proxy_ndp? */
426 if (net->ipv6.devconf_all->proxy_ndp &&
427 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
428 int proxied = ip6_forward_proxy_check(skb);
430 return ip6_input(skb);
431 else if (proxied < 0) {
432 IP6_INC_STATS(net, ip6_dst_idev(dst),
433 IPSTATS_MIB_INDISCARDS);
438 if (!xfrm6_route_forward(skb)) {
439 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
444 /* IPv6 specs say nothing about it, but it is clear that we cannot
445 send redirects to source routed frames.
446 We don't send redirects to frames decapsulated from IPsec.
448 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
449 struct in6_addr *target = NULL;
450 struct inet_peer *peer;
454 * incoming and outgoing devices are the same
458 rt = (struct rt6_info *) dst;
459 if (rt->rt6i_flags & RTF_GATEWAY)
460 target = &rt->rt6i_gateway;
462 target = &hdr->daddr;
464 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
466 /* Limit redirects both by destination (here)
467 and by source (inside ndisc_send_redirect)
469 if (inet_peer_xrlim_allow(peer, 1*HZ))
470 ndisc_send_redirect(skb, target);
474 int addrtype = ipv6_addr_type(&hdr->saddr);
476 /* This check is security critical. */
477 if (addrtype == IPV6_ADDR_ANY ||
478 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
480 if (addrtype & IPV6_ADDR_LINKLOCAL) {
481 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
482 ICMPV6_NOT_NEIGHBOUR, 0);
488 if (mtu < IPV6_MIN_MTU)
491 if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
492 (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
493 /* Again, force OUTPUT device used as source address */
495 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
496 IP6_INC_STATS_BH(net,
497 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
498 IP6_INC_STATS_BH(net,
499 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
504 if (skb_cow(skb, dst->dev->hard_header_len)) {
505 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
511 /* Mangling hops number delayed to point after skb COW */
515 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
516 IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
517 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
521 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
527 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
529 to->pkt_type = from->pkt_type;
530 to->priority = from->priority;
531 to->protocol = from->protocol;
533 skb_dst_set(to, dst_clone(skb_dst(from)));
535 to->mark = from->mark;
537 #ifdef CONFIG_NET_SCHED
538 to->tc_index = from->tc_index;
541 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
542 to->nf_trace = from->nf_trace;
544 skb_copy_secmark(to, from);
547 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
549 u16 offset = sizeof(struct ipv6hdr);
550 struct ipv6_opt_hdr *exthdr =
551 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
552 unsigned int packet_len = skb->tail - skb->network_header;
554 *nexthdr = &ipv6_hdr(skb)->nexthdr;
556 while (offset + 1 <= packet_len) {
562 case NEXTHDR_ROUTING:
566 #if IS_ENABLED(CONFIG_IPV6_MIP6)
567 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
577 offset += ipv6_optlen(exthdr);
578 *nexthdr = &exthdr->nexthdr;
579 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
586 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
588 static atomic_t ipv6_fragmentation_id;
591 if (rt && !(rt->dst.flags & DST_NOPEER)) {
592 struct inet_peer *peer;
595 net = dev_net(rt->dst.dev);
596 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
598 fhdr->identification = htonl(inet_getid(peer, 0));
604 old = atomic_read(&ipv6_fragmentation_id);
608 } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
609 fhdr->identification = htonl(new);
612 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
614 struct sk_buff *frag;
615 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
616 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
617 struct ipv6hdr *tmp_hdr;
619 unsigned int mtu, hlen, left, len;
622 int ptr, offset = 0, err=0;
623 u8 *prevhdr, nexthdr = 0;
624 struct net *net = dev_net(skb_dst(skb)->dev);
626 hlen = ip6_find_1stfragopt(skb, &prevhdr);
629 mtu = ip6_skb_dst_mtu(skb);
631 /* We must not fragment if the socket is set to force MTU discovery
632 * or if the skb it not generated by a local socket.
634 if (unlikely(!skb->local_df && skb->len > mtu) ||
635 (IP6CB(skb)->frag_max_size &&
636 IP6CB(skb)->frag_max_size > mtu)) {
637 if (skb->sk && dst_allfrag(skb_dst(skb)))
638 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
640 skb->dev = skb_dst(skb)->dev;
641 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
642 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
643 IPSTATS_MIB_FRAGFAILS);
648 if (np && np->frag_size < mtu) {
652 mtu -= hlen + sizeof(struct frag_hdr);
654 if (skb_has_frag_list(skb)) {
655 int first_len = skb_pagelen(skb);
656 struct sk_buff *frag2;
658 if (first_len - hlen > mtu ||
659 ((first_len - hlen) & 7) ||
663 skb_walk_frags(skb, frag) {
664 /* Correct geometry. */
665 if (frag->len > mtu ||
666 ((frag->len & 7) && frag->next) ||
667 skb_headroom(frag) < hlen)
668 goto slow_path_clean;
670 /* Partially cloned skb? */
671 if (skb_shared(frag))
672 goto slow_path_clean;
677 frag->destructor = sock_wfree;
679 skb->truesize -= frag->truesize;
684 frag = skb_shinfo(skb)->frag_list;
685 skb_frag_list_init(skb);
688 *prevhdr = NEXTHDR_FRAGMENT;
689 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
691 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
692 IPSTATS_MIB_FRAGFAILS);
696 __skb_pull(skb, hlen);
697 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
698 __skb_push(skb, hlen);
699 skb_reset_network_header(skb);
700 memcpy(skb_network_header(skb), tmp_hdr, hlen);
702 ipv6_select_ident(fh, rt);
703 fh->nexthdr = nexthdr;
705 fh->frag_off = htons(IP6_MF);
706 frag_id = fh->identification;
708 first_len = skb_pagelen(skb);
709 skb->data_len = first_len - skb_headlen(skb);
710 skb->len = first_len;
711 ipv6_hdr(skb)->payload_len = htons(first_len -
712 sizeof(struct ipv6hdr));
717 /* Prepare header of the next frame,
718 * before previous one went down. */
720 frag->ip_summed = CHECKSUM_NONE;
721 skb_reset_transport_header(frag);
722 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
723 __skb_push(frag, hlen);
724 skb_reset_network_header(frag);
725 memcpy(skb_network_header(frag), tmp_hdr,
727 offset += skb->len - hlen - sizeof(struct frag_hdr);
728 fh->nexthdr = nexthdr;
730 fh->frag_off = htons(offset);
731 if (frag->next != NULL)
732 fh->frag_off |= htons(IP6_MF);
733 fh->identification = frag_id;
734 ipv6_hdr(frag)->payload_len =
736 sizeof(struct ipv6hdr));
737 ip6_copy_metadata(frag, skb);
742 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
743 IPSTATS_MIB_FRAGCREATES);
756 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
757 IPSTATS_MIB_FRAGOKS);
768 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
769 IPSTATS_MIB_FRAGFAILS);
774 skb_walk_frags(skb, frag2) {
778 frag2->destructor = NULL;
779 skb->truesize += frag2->truesize;
784 if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
785 skb_checksum_help(skb))
788 left = skb->len - hlen; /* Space per frame */
789 ptr = hlen; /* Where to start from */
792 * Fragment the datagram.
795 *prevhdr = NEXTHDR_FRAGMENT;
796 hroom = LL_RESERVED_SPACE(rt->dst.dev);
797 troom = rt->dst.dev->needed_tailroom;
800 * Keep copying data until we run out.
804 /* IF: it doesn't fit, use 'mtu' - the data space left */
807 /* IF: we are not sending up to and including the packet end
808 then align the next start on an eight byte boundary */
816 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
817 hroom + troom, GFP_ATOMIC)) == NULL) {
818 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
819 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
820 IPSTATS_MIB_FRAGFAILS);
826 * Set up data on packet
829 ip6_copy_metadata(frag, skb);
830 skb_reserve(frag, hroom);
831 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
832 skb_reset_network_header(frag);
833 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
834 frag->transport_header = (frag->network_header + hlen +
835 sizeof(struct frag_hdr));
838 * Charge the memory for the fragment to any owner
842 skb_set_owner_w(frag, skb->sk);
845 * Copy the packet header into the new buffer.
847 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
850 * Build fragment header.
852 fh->nexthdr = nexthdr;
855 ipv6_select_ident(fh, rt);
856 frag_id = fh->identification;
858 fh->identification = frag_id;
861 * Copy a block of the IP datagram.
863 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
867 fh->frag_off = htons(offset);
869 fh->frag_off |= htons(IP6_MF);
870 ipv6_hdr(frag)->payload_len = htons(frag->len -
871 sizeof(struct ipv6hdr));
877 * Put this fragment into the sending queue.
883 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
884 IPSTATS_MIB_FRAGCREATES);
886 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
887 IPSTATS_MIB_FRAGOKS);
892 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
893 IPSTATS_MIB_FRAGFAILS);
898 static inline int ip6_rt_check(const struct rt6key *rt_key,
899 const struct in6_addr *fl_addr,
900 const struct in6_addr *addr_cache)
902 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
903 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
906 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
907 struct dst_entry *dst,
908 const struct flowi6 *fl6)
910 struct ipv6_pinfo *np = inet6_sk(sk);
911 struct rt6_info *rt = (struct rt6_info *)dst;
916 /* Yes, checking route validity in not connected
917 * case is not very simple. Take into account,
918 * that we do not support routing by source, TOS,
919 * and MSG_DONTROUTE --ANK (980726)
921 * 1. ip6_rt_check(): If route was host route,
922 * check that cached destination is current.
923 * If it is network route, we still may
924 * check its validity using saved pointer
925 * to the last used address: daddr_cache.
926 * We do not want to save whole address now,
927 * (because main consumer of this service
928 * is tcp, which has not this problem),
929 * so that the last trick works only on connected
931 * 2. oif also should be the same.
933 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
934 #ifdef CONFIG_IPV6_SUBTREES
935 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
937 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
946 static int ip6_dst_lookup_tail(struct sock *sk,
947 struct dst_entry **dst, struct flowi6 *fl6)
949 struct net *net = sock_net(sk);
950 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
957 *dst = ip6_route_output(net, sk, fl6);
959 if ((err = (*dst)->error))
960 goto out_err_release;
962 if (ipv6_addr_any(&fl6->saddr)) {
963 struct rt6_info *rt = (struct rt6_info *) *dst;
964 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
965 sk ? inet6_sk(sk)->srcprefs : 0,
968 goto out_err_release;
971 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
973 * Here if the dst entry we've looked up
974 * has a neighbour entry that is in the INCOMPLETE
975 * state and the src address from the flow is
976 * marked as OPTIMISTIC, we release the found
977 * dst entry and replace it instead with the
978 * dst entry of the nexthop router
980 rt = (struct rt6_info *) *dst;
982 if (n && !(n->nud_state & NUD_VALID)) {
983 struct inet6_ifaddr *ifp;
984 struct flowi6 fl_gw6;
987 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
990 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
996 * We need to get the dst entry for the
997 * default router instead
1000 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1001 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1002 *dst = ip6_route_output(net, sk, &fl_gw6);
1003 if ((err = (*dst)->error))
1004 goto out_err_release;
1012 if (err == -ENETUNREACH)
1013 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1020 * ip6_dst_lookup - perform route lookup on flow
1021 * @sk: socket which provides route info
1022 * @dst: pointer to dst_entry * for result
1023 * @fl6: flow to lookup
1025 * This function performs a route lookup on the given flow.
1027 * It returns zero on success, or a standard errno code on error.
1029 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1032 return ip6_dst_lookup_tail(sk, dst, fl6);
1034 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1037 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1038 * @sk: socket which provides route info
1039 * @fl6: flow to lookup
1040 * @final_dst: final destination address for ipsec lookup
1041 * @can_sleep: we are in a sleepable context
1043 * This function performs a route lookup on the given flow.
1045 * It returns a valid dst pointer on success, or a pointer encoded
1048 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1049 const struct in6_addr *final_dst,
1052 struct dst_entry *dst = NULL;
1055 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1057 return ERR_PTR(err);
1059 fl6->daddr = *final_dst;
1061 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1063 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1065 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1068 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1069 * @sk: socket which provides the dst cache and route info
1070 * @fl6: flow to lookup
1071 * @final_dst: final destination address for ipsec lookup
1072 * @can_sleep: we are in a sleepable context
1074 * This function performs a route lookup on the given flow with the
1075 * possibility of using the cached route in the socket if it is valid.
1076 * It will take the socket dst lock when operating on the dst cache.
1077 * As a result, this function can only be used in process context.
1079 * It returns a valid dst pointer on success, or a pointer encoded
1082 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1083 const struct in6_addr *final_dst,
1086 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1089 dst = ip6_sk_dst_check(sk, dst, fl6);
1091 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1093 return ERR_PTR(err);
1095 fl6->daddr = *final_dst;
1097 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1099 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1101 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1103 static inline int ip6_ufo_append_data(struct sock *sk,
1104 int getfrag(void *from, char *to, int offset, int len,
1105 int odd, struct sk_buff *skb),
1106 void *from, int length, int hh_len, int fragheaderlen,
1107 int transhdrlen, int mtu,unsigned int flags,
1108 struct rt6_info *rt)
1111 struct sk_buff *skb;
1114 /* There is support for UDP large send offload by network
1115 * device, so create one single skb packet containing complete
1118 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1119 skb = sock_alloc_send_skb(sk,
1120 hh_len + fragheaderlen + transhdrlen + 20,
1121 (flags & MSG_DONTWAIT), &err);
1125 /* reserve space for Hardware header */
1126 skb_reserve(skb, hh_len);
1128 /* create space for UDP/IP header */
1129 skb_put(skb,fragheaderlen + transhdrlen);
1131 /* initialize network header pointer */
1132 skb_reset_network_header(skb);
1134 /* initialize protocol header pointer */
1135 skb->transport_header = skb->network_header + fragheaderlen;
1137 skb->ip_summed = CHECKSUM_PARTIAL;
1141 err = skb_append_datato_frags(sk,skb, getfrag, from,
1142 (length - transhdrlen));
1144 struct frag_hdr fhdr;
1146 /* Specify the length of each IPv6 datagram fragment.
1147 * It has to be a multiple of 8.
1149 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1150 sizeof(struct frag_hdr)) & ~7;
1151 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1152 ipv6_select_ident(&fhdr, rt);
1153 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1154 __skb_queue_tail(&sk->sk_write_queue, skb);
1158 /* There is not enough support do UPD LSO,
1159 * so follow normal path
1166 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1169 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1172 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1175 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1178 static void ip6_append_data_mtu(int *mtu,
1180 unsigned int fragheaderlen,
1181 struct sk_buff *skb,
1182 struct rt6_info *rt)
1184 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1186 /* first fragment, reserve header_len */
1187 *mtu = *mtu - rt->dst.header_len;
1191 * this fragment is not first, the headers
1192 * space is regarded as data space.
1194 *mtu = dst_mtu(rt->dst.path);
1196 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1197 + fragheaderlen - sizeof(struct frag_hdr);
1201 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1202 int offset, int len, int odd, struct sk_buff *skb),
1203 void *from, int length, int transhdrlen,
1204 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1205 struct rt6_info *rt, unsigned int flags, int dontfrag)
1207 struct inet_sock *inet = inet_sk(sk);
1208 struct ipv6_pinfo *np = inet6_sk(sk);
1209 struct inet_cork *cork;
1210 struct sk_buff *skb, *skb_prev = NULL;
1211 unsigned int maxfraglen, fragheaderlen;
1221 if (flags&MSG_PROBE)
1223 cork = &inet->cork.base;
1224 if (skb_queue_empty(&sk->sk_write_queue)) {
1229 if (WARN_ON(np->cork.opt))
1232 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1233 if (unlikely(np->cork.opt == NULL))
1236 np->cork.opt->tot_len = opt->tot_len;
1237 np->cork.opt->opt_flen = opt->opt_flen;
1238 np->cork.opt->opt_nflen = opt->opt_nflen;
1240 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1242 if (opt->dst0opt && !np->cork.opt->dst0opt)
1245 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1247 if (opt->dst1opt && !np->cork.opt->dst1opt)
1250 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1252 if (opt->hopopt && !np->cork.opt->hopopt)
1255 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1257 if (opt->srcrt && !np->cork.opt->srcrt)
1260 /* need source address above miyazawa*/
1263 cork->dst = &rt->dst;
1264 inet->cork.fl.u.ip6 = *fl6;
1265 np->cork.hop_limit = hlimit;
1266 np->cork.tclass = tclass;
1267 if (rt->dst.flags & DST_XFRM_TUNNEL)
1268 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1269 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1271 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1272 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1273 if (np->frag_size < mtu) {
1275 mtu = np->frag_size;
1277 cork->fragsize = mtu;
1278 if (dst_allfrag(rt->dst.path))
1279 cork->flags |= IPCORK_ALLFRAG;
1281 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1282 length += exthdrlen;
1283 transhdrlen += exthdrlen;
1284 dst_exthdrlen = rt->dst.header_len;
1286 rt = (struct rt6_info *)cork->dst;
1287 fl6 = &inet->cork.fl.u.ip6;
1292 mtu = cork->fragsize;
1295 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1297 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1298 (opt ? opt->opt_nflen : 0);
1299 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1301 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1302 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1303 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1308 /* For UDP, check if TX timestamp is enabled */
1309 if (sk->sk_type == SOCK_DGRAM) {
1310 err = sock_tx_timestamp(sk, &tx_flags);
1316 * Let's try using as much space as possible.
1317 * Use MTU if total length of the message fits into the MTU.
1318 * Otherwise, we need to reserve fragment header and
1319 * fragment alignment (= 8-15 octects, in total).
1321 * Note that we may need to "move" the data from the tail of
1322 * of the buffer to the new fragment when we split
1325 * FIXME: It may be fragmented into multiple chunks
1326 * at once if non-fragmentable extension headers
1331 cork->length += length;
1333 int proto = sk->sk_protocol;
1334 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1335 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1339 if (proto == IPPROTO_UDP &&
1340 (rt->dst.dev->features & NETIF_F_UFO)) {
1342 err = ip6_ufo_append_data(sk, getfrag, from, length,
1343 hh_len, fragheaderlen,
1344 transhdrlen, mtu, flags, rt);
1351 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1354 while (length > 0) {
1355 /* Check if the remaining data fits into current packet. */
1356 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1358 copy = maxfraglen - skb->len;
1362 unsigned int datalen;
1363 unsigned int fraglen;
1364 unsigned int fraggap;
1365 unsigned int alloclen;
1367 /* There's no room in the current skb */
1369 fraggap = skb->len - maxfraglen;
1372 /* update mtu and maxfraglen if necessary */
1373 if (skb == NULL || skb_prev == NULL)
1374 ip6_append_data_mtu(&mtu, &maxfraglen,
1375 fragheaderlen, skb, rt);
1380 * If remaining data exceeds the mtu,
1381 * we know we need more fragment(s).
1383 datalen = length + fraggap;
1385 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1386 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1387 if ((flags & MSG_MORE) &&
1388 !(rt->dst.dev->features&NETIF_F_SG))
1391 alloclen = datalen + fragheaderlen;
1393 alloclen += dst_exthdrlen;
1395 if (datalen != length + fraggap) {
1397 * this is not the last fragment, the trailer
1398 * space is regarded as data space.
1400 datalen += rt->dst.trailer_len;
1403 alloclen += rt->dst.trailer_len;
1404 fraglen = datalen + fragheaderlen;
1407 * We just reserve space for fragment header.
1408 * Note: this may be overallocation if the message
1409 * (without MSG_MORE) fits into the MTU.
1411 alloclen += sizeof(struct frag_hdr);
1414 skb = sock_alloc_send_skb(sk,
1416 (flags & MSG_DONTWAIT), &err);
1419 if (atomic_read(&sk->sk_wmem_alloc) <=
1421 skb = sock_wmalloc(sk,
1422 alloclen + hh_len, 1,
1424 if (unlikely(skb == NULL))
1427 /* Only the initial fragment
1436 * Fill in the control structures
1438 skb->ip_summed = CHECKSUM_NONE;
1440 /* reserve for fragmentation and ipsec header */
1441 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1444 if (sk->sk_type == SOCK_DGRAM)
1445 skb_shinfo(skb)->tx_flags = tx_flags;
1448 * Find where to start putting bytes
1450 data = skb_put(skb, fraglen);
1451 skb_set_network_header(skb, exthdrlen);
1452 data += fragheaderlen;
1453 skb->transport_header = (skb->network_header +
1456 skb->csum = skb_copy_and_csum_bits(
1457 skb_prev, maxfraglen,
1458 data + transhdrlen, fraggap, 0);
1459 skb_prev->csum = csum_sub(skb_prev->csum,
1462 pskb_trim_unique(skb_prev, maxfraglen);
1464 copy = datalen - transhdrlen - fraggap;
1470 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1477 length -= datalen - fraggap;
1483 * Put the packet on the pending queue
1485 __skb_queue_tail(&sk->sk_write_queue, skb);
1492 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1496 if (getfrag(from, skb_put(skb, copy),
1497 offset, copy, off, skb) < 0) {
1498 __skb_trim(skb, off);
1503 int i = skb_shinfo(skb)->nr_frags;
1504 struct page_frag *pfrag = sk_page_frag(sk);
1507 if (!sk_page_frag_refill(sk, pfrag))
1510 if (!skb_can_coalesce(skb, i, pfrag->page,
1513 if (i == MAX_SKB_FRAGS)
1516 __skb_fill_page_desc(skb, i, pfrag->page,
1518 skb_shinfo(skb)->nr_frags = ++i;
1519 get_page(pfrag->page);
1521 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1523 page_address(pfrag->page) + pfrag->offset,
1524 offset, copy, skb->len, skb) < 0)
1527 pfrag->offset += copy;
1528 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1530 skb->data_len += copy;
1531 skb->truesize += copy;
1532 atomic_add(copy, &sk->sk_wmem_alloc);
1543 cork->length -= length;
1544 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1547 EXPORT_SYMBOL_GPL(ip6_append_data);
/*
 * ip6_cork_release - tear down the per-socket cork (pending-frame) state
 * once the queued data has been pushed out or flushed.
 *
 * Frees the cached IPv6 extension-header options, drops the reference on
 * the cached route, clears the all-fragment flag and wipes the cached
 * flow information so the next corked send starts from a clean slate.
 * NOTE(review): lines elided from this view likely include the
 * `if (np->cork.opt)` guard around the option kfree()s — confirm against
 * the full source.
 */
1549 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
/* Free each extension-header buffer individually, then the option
 * block itself; kfree(NULL) is a no-op so unset options are safe. */
1552 kfree(np->cork.opt->dst0opt);
1553 kfree(np->cork.opt->dst1opt);
1554 kfree(np->cork.opt->hopopt);
1555 kfree(np->cork.opt->srcrt);
1556 kfree(np->cork.opt);
/* Clear the pointer so a stale option block is never reused. */
1557 np->cork.opt = NULL;
/* Drop the route reference taken when the cork was set up. */
1560 if (inet->cork.base.dst) {
1561 dst_release(inet->cork.base.dst);
1562 inet->cork.base.dst = NULL;
/* Also forget the "always fragment" hint tied to that route. */
1563 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
/* Zero the cached flow key (addresses, ports, label, ...). */
1565 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * ip6_push_pending_frames - turn the socket's write queue (built up by
 * ip6_append_data) into one IPv6 datagram and hand it to the output path.
 *
 * The first queued skb becomes the head; every later skb is chained onto
 * its frag_list so the whole queue is sent as a single (possibly
 * fragmented later) packet.  The IPv6 header is built from the flow info
 * and cork state saved when the cork was opened.
 *
 * Returns the (errno-mapped) result of ip6_local_out(); 0 on success.
 * Always releases the cork state before returning, on both the success
 * and the (elided) error paths.
 */
1568 int ip6_push_pending_frames(struct sock *sk)
1570 struct sk_buff *skb, *tmp_skb;
1571 struct sk_buff **tail_skb;
1572 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1573 struct inet_sock *inet = inet_sk(sk);
1574 struct ipv6_pinfo *np = inet6_sk(sk);
1575 struct net *net = sock_net(sk);
1576 struct ipv6hdr *hdr;
/* Options and route/flow were stashed in the cork by ip6_append_data. */
1577 struct ipv6_txoptions *opt = np->cork.opt;
1578 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1579 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1580 unsigned char proto = fl6->flowi6_proto;
/* Nothing queued: nothing to push (error/out path is elided here). */
1583 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1585 tail_skb = &(skb_shinfo(skb)->frag_list);
1587 /* move skb->data to ip header from ext header */
1588 if (skb->data < skb_network_header(skb))
1589 __skb_pull(skb, skb_network_offset(skb));
/* Chain every remaining queued skb onto the head's frag_list,
 * stripping their network headers and folding their sizes into the
 * head so skb->len/data_len/truesize describe the whole datagram. */
1590 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1591 __skb_pull(tmp_skb, skb_network_header_len(skb));
1592 *tail_skb = tmp_skb;
1593 tail_skb = &(tmp_skb->next);
1594 skb->len += tmp_skb->len;
1595 skb->data_len += tmp_skb->len;
1596 skb->truesize += tmp_skb->truesize;
/* Accounting moved to the head skb; drop the per-fragment destructor
 * so the memory is not uncharged twice. */
1597 tmp_skb->destructor = NULL;
1601 /* Allow local fragmentation. */
1602 if (np->pmtudisc < IPV6_PMTUDISC_DO)
/* Routing-header options may rewrite the destination; start from the
 * flow's daddr and let ipv6_push_nfrag_opts() substitute as needed. */
1605 *final_dst = fl6->daddr;
1606 __skb_pull(skb, skb_network_header_len(skb));
/* Push fragmentable then non-fragmentable extension headers in front
 * of the payload; each updates 'proto' to chain the next-header field. */
1607 if (opt && opt->opt_flen)
1608 ipv6_push_frag_opts(skb, opt, &proto);
1609 if (opt && opt->opt_nflen)
1610 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
/* Now prepend and fill the fixed IPv6 header itself. */
1612 skb_push(skb, sizeof(struct ipv6hdr));
1613 skb_reset_network_header(skb);
1614 hdr = ipv6_hdr(skb);
/* First 32 bits: version 6, traffic class from the cork, flow label
 * (fl6->flowlabel is already in network byte order). */
1616 *(__be32*)hdr = fl6->flowlabel |
1617 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1619 hdr->hop_limit = np->cork.hop_limit;
1620 hdr->nexthdr = proto;
1621 hdr->saddr = fl6->saddr;
1622 hdr->daddr = *final_dst;
1624 skb->priority = sk->sk_priority;
1625 skb->mark = sk->sk_mark;
/* Attach the cached route (extra ref; the cork's own ref is dropped
 * in ip6_cork_release() below) and bump output counters. */
1627 skb_dst_set(skb, dst_clone(&rt->dst));
1628 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1629 if (proto == IPPROTO_ICMPV6) {
1630 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1632 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1633 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
/* Hand the finished datagram to netfilter + dst_output. */
1636 err = ip6_local_out(skb);
/* Map NET_XMIT_* soft-drop codes to errno values for the caller. */
1639 err = net_xmit_errno(err);
1645 ip6_cork_release(inet, np);
/* (Elided error path) count the discarded datagram before bailing out. */
1648 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1651 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
/*
 * ip6_flush_pending_frames - abort a corked send: discard every skb
 * still sitting on the socket's write queue and release the cork state.
 *
 * Each discarded datagram is counted in IPSTATS_MIB_OUTDISCARDS.
 * NOTE(review): the kfree_skb() of each dequeued skb sits on a line
 * elided from this view — confirm against the full source.
 */
1653 void ip6_flush_pending_frames(struct sock *sk)
1655 struct sk_buff *skb;
/* Drain from the tail until the queue is empty. */
1657 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1659 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1660 IPSTATS_MIB_OUTDISCARDS);
/* Free options, route reference and cached flow info. */
1664 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1666 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);