2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv6.h>
48 #include <net/ndisc.h>
49 #include <net/protocol.h>
50 #include <net/ip6_route.h>
51 #include <net/addrconf.h>
52 #include <net/rawv6.h>
55 #include <net/checksum.h>
56 #include <linux/mroute6.h>
58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * Assign the next value of the global IPv6 fragment identification
 * counter to the fragment header, in network byte order.  The counter
 * is protected by ip6_id_lock and deliberately skips 0 so that an ID
 * of zero is never handed out.
 */
60 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 static u32 ipv6_fragmentation_id = 1;
63 static DEFINE_SPINLOCK(ip6_id_lock);
65 spin_lock_bh(&ip6_id_lock);
66 fhdr->identification = htonl(ipv6_fragmentation_id);
/* Wrap back to 1, never 0, when the 32-bit counter overflows. */
67 if (++ipv6_fragmentation_id == 0)
68 ipv6_fragmentation_id = 1;
69 spin_unlock_bh(&ip6_id_lock);
/*
 * Finalize a locally generated packet: fill in the IPv6 payload length
 * (unless it exceeds IPV6_MAXPLEN — presumably left for jumbogram
 * handling on the elided branch; confirm against the full source) and
 * run it through the NF_INET_LOCAL_OUT netfilter hook.
 */
72 int __ip6_local_out(struct sk_buff *skb)
76 len = skb->len - sizeof(struct ipv6hdr);
77 if (len > IPV6_MAXPLEN)
79 ipv6_hdr(skb)->payload_len = htons(len);
81 return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
/*
 * Send a locally generated packet: run the LOCAL_OUT hook via
 * __ip6_local_out(), then hand the skb to dst_output() (the elided
 * line between the two calls presumably checks the hook verdict —
 * confirm against the full source).
 */
85 int ip6_local_out(struct sk_buff *skb)
89 err = __ip6_local_out(skb);
91 err = dst_output(skb);
95 EXPORT_SYMBOL_GPL(ip6_local_out);
/*
 * Last step of output: emit the packet via the cached hardware header
 * (neigh_hh_output) when one exists, otherwise via the neighbour's
 * output method.  With neither available the packet cannot be sent and
 * OUTNOROUTES is counted (the drop itself is on an elided line).
 */
97 static int ip6_output_finish(struct sk_buff *skb)
99 struct dst_entry *dst = skb->dst;
102 return neigh_hh_output(dst->hh, skb);
103 else if (dst->neighbour)
104 return dst->neighbour->output(skb);
106 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
112 /* dev_loopback_xmit for use with netfilter. */
/*
 * Loop a multicast copy back to the local stack: rewind to the network
 * header, mark the skb as looped-back, and skip checksumming since the
 * data never left the host.
 */
113 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
115 skb_reset_mac_header(newskb);
116 __skb_pull(newskb, skb_network_offset(newskb));
117 newskb->pkt_type = PACKET_LOOPBACK;
118 newskb->ip_summed = CHECKSUM_UNNECESSARY;
/* A loopback-delivered skb must already carry a valid route. */
119 BUG_TRAP(newskb->dst);
/*
 * Device-level output for an (already fragmented, if needed) packet.
 * For multicast destinations this delivers a looped-back clone to
 * local listeners when the socket has mc_loop enabled, enforces the
 * hop-limit-0 discard, and bumps the multicast counters.  All packets
 * finally pass through the NF_INET_POST_ROUTING hook.
 */
126 static int ip6_output2(struct sk_buff *skb)
128 struct dst_entry *dst = skb->dst;
129 struct net_device *dev = dst->dev;
131 skb->protocol = htons(ETH_P_IPV6);
134 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
135 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
136 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
/*
 * Loop a copy back locally unless the device is loopback, the
 * socket disabled mc_loop, or the packet was already forwarded
 * by the multicast routing socket.
 */
138 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
139 ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
140 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
141 &ipv6_hdr(skb)->saddr))) {
142 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
144 /* Do not check for IFF_ALLMULTI; multicast routing
145 is not supported in any case.
148 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
150 ip6_dev_loopback_xmit);
/* Hop limit 0 on multicast: deliver locally only, never onto the wire. */
152 if (ipv6_hdr(skb)->hop_limit == 0) {
153 IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
159 IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
162 return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
/*
 * MTU to use for this skb's route: the raw device MTU when the socket
 * opted into IPV6_PMTUDISC_PROBE (it probes path MTU itself), otherwise
 * the path MTU cached on the dst entry.
 */
166 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
168 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
170 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
171 skb->dst->dev->mtu : dst_mtu(skb->dst);
/*
 * Main output entry: fragment when the packet exceeds the route MTU
 * (and is not GSO, which fragments later in hardware/software) or when
 * the route demands fragmentation of every packet (dst_allfrag);
 * otherwise transmit directly via ip6_output2().
 */
174 int ip6_output(struct sk_buff *skb)
176 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
177 dst_allfrag(skb->dst))
178 return ip6_fragment(skb, ip6_output2);
180 return ip6_output2(skb);
184 * xmit an sk_buff (used by TCP)
/*
 * Build the IPv6 header (and any extension headers from @opt) onto an
 * skb and send it through NF_INET_LOCAL_OUT.  @ipfragok permits
 * sending over-MTU packets; otherwise an over-MTU non-GSO packet gets
 * ICMPV6_PKT_TOOBIG reported back to the sender and is dropped.
 */
187 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
188 struct ipv6_txoptions *opt, int ipfragok)
190 struct ipv6_pinfo *np = inet6_sk(sk);
191 struct in6_addr *first_hop = &fl->fl6_dst;
192 struct dst_entry *dst = skb->dst;
194 u8 proto = fl->proto;
195 int seg_len = skb->len;
200 unsigned int head_room;
202 /* First: exthdrs may take lots of space (~8K for now)
203 MAX_HEADER is not enough.
205 head_room = opt->opt_nflen + opt->opt_flen;
206 seg_len += head_room;
207 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
/* Re-allocate headroom when the skb cannot hold all headers in front. */
209 if (skb_headroom(skb) < head_room) {
210 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
212 IP6_INC_STATS(ip6_dst_idev(skb->dst),
213 IPSTATS_MIB_OUTDISCARDS);
220 skb_set_owner_w(skb, sk);
/* Push fragmentable then non-fragmentable extension headers;
 * the latter may rewrite first_hop (e.g. routing header). */
223 ipv6_push_frag_opts(skb, opt, &proto);
225 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
228 skb_push(skb, sizeof(struct ipv6hdr));
229 skb_reset_network_header(skb);
233 * Fill in the IPv6 header
/* Hop limit: per-socket value if set, else the route default. */
238 hlimit = np->hop_limit;
240 hlimit = ip6_dst_hoplimit(dst);
/* Version 6, traffic class, and flow label packed into the first word. */
248 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
250 hdr->payload_len = htons(seg_len);
251 hdr->nexthdr = proto;
252 hdr->hop_limit = hlimit;
254 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
255 ipv6_addr_copy(&hdr->daddr, first_hop);
257 skb->priority = sk->sk_priority;
258 skb->mark = sk->sk_mark;
/* Fits the MTU (or caller allows fragmentation / GSO): send it. */
261 if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
262 IP6_INC_STATS(ip6_dst_idev(skb->dst),
263 IPSTATS_MIB_OUTREQUESTS);
264 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
/* Too big and fragmentation not allowed: tell our own stack. */
269 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
271 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
272 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
277 EXPORT_SYMBOL(ip6_xmit);
280 * To avoid extra problems ND packets are send through this
281 * routine. It's code duplication but I really want to avoid
282 * extra checks since ipv6_build_header is used by TCP (which
283 * is for us performance critical)
/*
 * Build a minimal IPv6 header for neighbour-discovery packets: fixed
 * version word, explicit payload length, next header, the socket's
 * hop limit, and caller-supplied source/destination addresses.
 */
286 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
287 const struct in6_addr *saddr, const struct in6_addr *daddr,
290 struct ipv6_pinfo *np = inet6_sk(sk);
294 skb->protocol = htons(ETH_P_IPV6);
297 totlen = len + sizeof(struct ipv6hdr);
299 skb_reset_network_header(skb);
300 skb_put(skb, sizeof(struct ipv6hdr));
/* Version 6, zero traffic class, zero flow label. */
303 *(__be32*)hdr = htonl(0x60000000);
305 hdr->payload_len = htons(len);
306 hdr->nexthdr = proto;
307 hdr->hop_limit = np->hop_limit;
309 ipv6_addr_copy(&hdr->saddr, saddr);
310 ipv6_addr_copy(&hdr->daddr, daddr);
/*
 * Deliver a Router-Alert packet to every raw socket registered on the
 * ip6_ra_chain whose selector matches @sel and whose device binding
 * (if any) matches the incoming device.  Clones are handed to all but
 * the last matching socket; the last one consumes the skb itself, in
 * which case the function signals the caller that the packet is gone.
 */
315 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
317 struct ip6_ra_chain *ra;
318 struct sock *last = NULL;
320 read_lock(&ip6_ra_lock);
321 for (ra = ip6_ra_chain; ra; ra = ra->next) {
322 struct sock *sk = ra->sk;
323 if (sk && ra->sel == sel &&
324 (!sk->sk_bound_dev_if ||
325 sk->sk_bound_dev_if == skb->dev->ifindex)) {
/* A previous match exists: give it a clone, keep the original. */
327 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
329 rawv6_rcv(last, skb2);
/* Final matching socket receives the original skb. */
336 rawv6_rcv(last, skb);
337 read_unlock(&ip6_ra_lock);
340 read_unlock(&ip6_ra_lock);
/*
 * Decide how to treat a packet addressed to a proxied neighbour:
 * walk past any extension headers, and when the payload is ICMPv6
 * neighbour-discovery, let it be handled locally on behalf of the
 * proxied address.  Link-local destinations cannot be proxied and
 * cause a link failure to be reported to the sender.
 */
344 static int ip6_forward_proxy_check(struct sk_buff *skb)
346 struct ipv6hdr *hdr = ipv6_hdr(skb);
347 u8 nexthdr = hdr->nexthdr;
350 if (ipv6_ext_hdr(nexthdr)) {
351 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
355 offset = sizeof(struct ipv6hdr);
357 if (nexthdr == IPPROTO_ICMPV6) {
358 struct icmp6hdr *icmp6;
/* Make sure at least the ICMPv6 type byte is in the linear area. */
360 if (!pskb_may_pull(skb, (skb_network_header(skb) +
361 offset + 1 - skb->data)))
364 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
366 switch (icmp6->icmp6_type) {
367 case NDISC_ROUTER_SOLICITATION:
368 case NDISC_ROUTER_ADVERTISEMENT:
369 case NDISC_NEIGHBOUR_SOLICITATION:
370 case NDISC_NEIGHBOUR_ADVERTISEMENT:
372 /* For reaction involving unicast neighbor discovery
373 * message destined to the proxied address, pass it to
383 * The proxying router can't forward traffic sent to a link-local
384 * address, so signal the sender and discard the packet. This
385 * behavior is clarified by the MIPv6 specification.
387 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
388 dst_link_failure(skb);
/* Final step of forwarding: hand the skb to the route's output path. */
395 static inline int ip6_forward_finish(struct sk_buff *skb)
397 return dst_output(skb);
/*
 * Forward a received packet toward its next hop.  Performs the full
 * router-side checks: forwarding enabled, no LRO-merged skbs, XFRM
 * forward policy, Router-Alert delivery to user space, hop-limit
 * decrement with ICMP time-exceeded on expiry, ND-proxy handling,
 * redirect generation, source-address sanity, and path-MTU
 * enforcement, before queueing via the NF_INET_FORWARD hook.
 */
400 int ip6_forward(struct sk_buff *skb)
402 struct dst_entry *dst = skb->dst;
403 struct ipv6hdr *hdr = ipv6_hdr(skb);
404 struct inet6_skb_parm *opt = IP6CB(skb);
405 struct net *net = dev_net(dst->dev);
/* Not acting as a router: refuse to forward. */
407 if (ipv6_devconf.forwarding == 0)
410 if (skb_warn_if_lro(skb))
413 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
414 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
418 skb_forward_csum(skb);
421 * We DO NOT make any processing on
422 * RA packets, pushing them to user level AS IS
423 * without any warranty that application will be able
424 * to interpret them. The reason is that we
425 * cannot make anything clever here.
427 * We are not end-node, so that if packet contains
428 * AH/ESP, we cannot make anything.
429 * Defragmentation also would be mistake, RA packets
430 * cannot be fragmented, because there is no warranty
431 * that different fragments will go along one path. --ANK
434 u8 *ptr = skb_network_header(skb) + opt->ra;
435 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
440 * check and decrement ttl
442 if (hdr->hop_limit <= 1) {
443 /* Force OUTPUT device used as source address */
445 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
447 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
453 /* XXX: idev->cnf.proxy_ndp? */
/* ND proxying: packets for proxied addresses may be consumed locally. */
454 if (ipv6_devconf.proxy_ndp &&
455 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
456 int proxied = ip6_forward_proxy_check(skb);
458 return ip6_input(skb);
459 else if (proxied < 0) {
460 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
465 if (!xfrm6_route_forward(skb)) {
466 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
471 /* IPv6 specs say nothing about it, but it is clear that we cannot
472 send redirects to source routed frames.
473 We don't send redirects to frames decapsulated from IPsec.
475 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
477 struct in6_addr *target = NULL;
479 struct neighbour *n = dst->neighbour;
482 * incoming and outgoing devices are the same
/* Redirect target: the gateway if the route has one, else the
 * final destination itself. */
486 rt = (struct rt6_info *) dst;
487 if ((rt->rt6i_flags & RTF_GATEWAY))
488 target = (struct in6_addr*)&n->primary_key;
490 target = &hdr->daddr;
492 /* Limit redirects both by destination (here)
493 and by source (inside ndisc_send_redirect)
495 if (xrlim_allow(dst, 1*HZ))
496 ndisc_send_redirect(skb, n, target);
498 int addrtype = ipv6_addr_type(&hdr->saddr);
500 /* This check is security critical. */
/* Never forward packets with unspecified, multicast, or loopback
 * source addresses; link-local sources get an explicit error. */
501 if (addrtype == IPV6_ADDR_ANY ||
502 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
504 if (addrtype & IPV6_ADDR_LINKLOCAL) {
505 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
506 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
/* Routers never fragment forwarded IPv6 packets: report PMTU instead. */
511 if (skb->len > dst_mtu(dst)) {
512 /* Again, force OUTPUT device used as source address */
514 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
515 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
516 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
/* Private copy of the header before we decrement the hop limit. */
521 if (skb_cow(skb, dst->dev->hard_header_len)) {
522 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
528 /* Mangling hops number delayed to point after skb COW */
532 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
533 return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
537 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * Copy per-packet metadata from the original skb to a freshly built
 * fragment: packet type, priority, protocol, a cloned route reference,
 * the firewall mark, and (when configured) traffic-control index,
 * netfilter trace flag, and security mark.
 */
543 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
545 to->pkt_type = from->pkt_type;
546 to->priority = from->priority;
547 to->protocol = from->protocol;
/* Drop any stale dst on the target before taking a fresh reference. */
548 dst_release(to->dst);
549 to->dst = dst_clone(from->dst);
551 to->mark = from->mark;
553 #ifdef CONFIG_NET_SCHED
554 to->tc_index = from->tc_index;
557 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
558 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
559 to->nf_trace = from->nf_trace;
561 skb_copy_secmark(to, from);
/*
 * Locate the insertion point for a Fragment header: walk the chain of
 * extension headers starting right after the IPv6 header and return
 * the offset of the first header that belongs in the fragmentable
 * part.  *nexthdr is left pointing at the "next header" byte that must
 * be overwritten with NEXTHDR_FRAGMENT.
 */
564 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
566 u16 offset = sizeof(struct ipv6hdr);
567 struct ipv6_opt_hdr *exthdr =
568 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
569 unsigned int packet_len = skb->tail - skb->network_header;
571 *nexthdr = &ipv6_hdr(skb)->nexthdr;
573 while (offset + 1 <= packet_len) {
579 case NEXTHDR_ROUTING:
/* With Mobile IPv6, a Home Address option forces fragmentation
 * to start earlier — confirm handling against the elided lines. */
583 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
584 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
594 offset += ipv6_optlen(exthdr);
595 *nexthdr = &exthdr->nexthdr;
596 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
/*
 * Fragment an over-MTU packet and pass each fragment to @output.
 * Two strategies:
 *  - fast path: when the skb already carries a frag_list with suitably
 *    sized, unshared fragments, a Fragment header is spliced into each
 *    piece in place and the chain is sent without copying;
 *  - slow path: otherwise fresh skbs are allocated and the payload is
 *    copied out in MTU-sized, 8-byte-aligned chunks.
 * Sockets that forced PMTU discovery (!local_df) instead get an
 * ICMPV6_PKT_TOOBIG and the packet is dropped.
 */
603 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
605 struct net_device *dev;
606 struct sk_buff *frag;
607 struct rt6_info *rt = (struct rt6_info*)skb->dst;
608 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
609 struct ipv6hdr *tmp_hdr;
611 unsigned int mtu, hlen, left, len;
613 int ptr, offset = 0, err=0;
614 u8 *prevhdr, nexthdr = 0;
617 hlen = ip6_find_1stfragopt(skb, &prevhdr);
620 mtu = ip6_skb_dst_mtu(skb);
622 /* We must not fragment if the socket is set to force MTU discovery
623 * or if the skb it not generated by a local socket. (This last
624 * check should be redundant, but it's free.)
626 if (!skb->local_df) {
627 skb->dev = skb->dst->dev;
628 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
629 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
/* Honor a smaller per-socket fragment size if one is configured. */
634 if (np && np->frag_size < mtu) {
/* Usable payload per fragment: MTU minus the unfragmentable part
 * and the Fragment header itself. */
638 mtu -= hlen + sizeof(struct frag_hdr);
/* ---- Fast path: packet already split into a frag_list ---- */
640 if (skb_shinfo(skb)->frag_list) {
641 int first_len = skb_pagelen(skb);
/* The geometry must be usable as-is: every piece within MTU,
 * all but the last 8-byte aligned, with headroom for headers. */
644 if (first_len - hlen > mtu ||
645 ((first_len - hlen) & 7) ||
649 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
650 /* Correct geometry. */
651 if (frag->len > mtu ||
652 ((frag->len & 7) && frag->next) ||
653 skb_headroom(frag) < hlen)
656 /* Partially cloned skb? */
657 if (skb_shared(frag))
/* Transfer write-memory accounting from the head skb to each
 * fragment so the socket's wmem stays balanced. */
664 frag->destructor = sock_wfree;
665 truesizes += frag->truesize;
671 frag = skb_shinfo(skb)->frag_list;
672 skb_shinfo(skb)->frag_list = NULL;
/* Keep a copy of the unfragmentable headers to replicate into
 * every fragment. */
675 *prevhdr = NEXTHDR_FRAGMENT;
676 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
678 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
/* Open space for the Fragment header in the first piece. */
682 __skb_pull(skb, hlen);
683 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
684 __skb_push(skb, hlen);
685 skb_reset_network_header(skb);
686 memcpy(skb_network_header(skb), tmp_hdr, hlen);
688 ipv6_select_ident(skb, fh);
689 fh->nexthdr = nexthdr;
/* First fragment: offset 0, More Fragments set. */
691 fh->frag_off = htons(IP6_MF);
692 frag_id = fh->identification;
694 first_len = skb_pagelen(skb);
695 skb->data_len = first_len - skb_headlen(skb);
696 skb->truesize -= truesizes;
697 skb->len = first_len;
698 ipv6_hdr(skb)->payload_len = htons(first_len -
699 sizeof(struct ipv6hdr));
701 dst_hold(&rt->u.dst);
704 /* Prepare header of the next frame,
705 * before previous one went down. */
707 frag->ip_summed = CHECKSUM_NONE;
708 skb_reset_transport_header(frag);
709 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
710 __skb_push(frag, hlen);
711 skb_reset_network_header(frag);
712 memcpy(skb_network_header(frag), tmp_hdr,
714 offset += skb->len - hlen - sizeof(struct frag_hdr);
715 fh->nexthdr = nexthdr;
717 fh->frag_off = htons(offset);
/* MF is set on every fragment except the last. */
718 if (frag->next != NULL)
719 fh->frag_off |= htons(IP6_MF);
720 fh->identification = frag_id;
721 ipv6_hdr(frag)->payload_len =
723 sizeof(struct ipv6hdr));
724 ip6_copy_metadata(frag, skb);
729 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
742 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
743 dst_release(&rt->u.dst);
753 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
754 dst_release(&rt->u.dst);
/* ---- Slow path: allocate and copy each fragment ---- */
759 left = skb->len - hlen; /* Space per frame */
760 ptr = hlen; /* Where to start from */
763 * Fragment the datagram.
766 *prevhdr = NEXTHDR_FRAGMENT;
769 * Keep copying data until we run out.
773 /* IF: it doesn't fit, use 'mtu' - the data space left */
776 /* IF: we are not sending upto and including the packet end
777 then align the next start on an eight byte boundary */
785 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
786 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
787 IP6_INC_STATS(ip6_dst_idev(skb->dst),
788 IPSTATS_MIB_FRAGFAILS);
794 * Set up data on packet
797 ip6_copy_metadata(frag, skb);
798 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
799 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
800 skb_reset_network_header(frag);
801 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
802 frag->transport_header = (frag->network_header + hlen +
803 sizeof(struct frag_hdr));
806 * Charge the memory for the fragment to any owner
810 skb_set_owner_w(frag, skb->sk);
813 * Copy the packet header into the new buffer.
815 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
818 * Build fragment header.
820 fh->nexthdr = nexthdr;
/* ID is drawn once (first fragment) and reused for the rest. */
823 ipv6_select_ident(skb, fh);
824 frag_id = fh->identification;
826 fh->identification = frag_id;
829 * Copy a block of the IP datagram.
831 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
835 fh->frag_off = htons(offset);
837 fh->frag_off |= htons(IP6_MF);
838 ipv6_hdr(frag)->payload_len = htons(frag->len -
839 sizeof(struct ipv6hdr));
845 * Put this fragment into the sending queue.
851 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
853 IP6_INC_STATS(ip6_dst_idev(skb->dst),
854 IPSTATS_MIB_FRAGOKS);
859 IP6_INC_STATS(ip6_dst_idev(skb->dst),
860 IPSTATS_MIB_FRAGFAILS);
/*
 * Check whether a cached route is stale for the given flow address:
 * returns non-zero when the route is neither an exact host route for
 * @fl_addr nor matched by the socket's cached destination address.
 */
865 static inline int ip6_rt_check(struct rt6key *rt_key,
866 struct in6_addr *fl_addr,
867 struct in6_addr *addr_cache)
869 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
870 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
/*
 * Validate a socket's cached dst entry against the flow about to be
 * sent.  Returns the dst if still usable; the mismatch branch (elided
 * here) presumably releases it and returns NULL so the caller redoes
 * the route lookup — confirm against the full source.
 */
873 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
874 struct dst_entry *dst,
877 struct ipv6_pinfo *np = inet6_sk(sk);
878 struct rt6_info *rt = (struct rt6_info *)dst;
883 /* Yes, checking route validity in not connected
884 * case is not very simple. Take into account,
885 * that we do not support routing by source, TOS,
886 * and MSG_DONTROUTE --ANK (980726)
888 * 1. ip6_rt_check(): If route was host route,
889 * check that cached destination is current.
890 * If it is network route, we still may
891 * check its validity using saved pointer
892 * to the last used address: daddr_cache.
893 * We do not want to save whole address now,
894 * (because main consumer of this service
895 * is tcp, which has not this problem),
896 * so that the last trick works only on connected
898 * 2. oif also should be the same.
900 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
901 #ifdef CONFIG_IPV6_SUBTREES
902 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
904 (fl->oif && fl->oif != dst->dev->ifindex)) {
/*
 * Common tail of route lookup: resolve the flow to a dst entry, pick a
 * source address if the flow left it unspecified, and — when the
 * chosen source is an optimistic-DAD address whose nexthop neighbour
 * is not yet valid — redo the lookup toward the default router.
 * On failure releases the dst and returns a negative errno.
 */
913 static int ip6_dst_lookup_tail(struct sock *sk,
914 struct dst_entry **dst, struct flowi *fl)
917 struct net *net = sock_net(sk);
920 *dst = ip6_route_output(net, sk, fl);
922 if ((err = (*dst)->error))
923 goto out_err_release;
/* No source address given: derive one from the output device,
 * honoring the socket's source-address preferences. */
925 if (ipv6_addr_any(&fl->fl6_src)) {
926 err = ipv6_dev_get_saddr(ip6_dst_idev(*dst)->dev,
928 sk ? inet6_sk(sk)->srcprefs : 0,
931 goto out_err_release;
934 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
936 * Here if the dst entry we've looked up
937 * has a neighbour entry that is in the INCOMPLETE
938 * state and the src address from the flow is
939 * marked as OPTIMISTIC, we release the found
940 * dst entry and replace it instead with the
941 * dst entry of the nexthop router
943 if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
944 struct inet6_ifaddr *ifp;
948 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
951 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
957 * We need to get the dst entry for the
958 * default router instead
961 memcpy(&fl_gw, fl, sizeof(struct flowi));
/* Zero destination => route toward the default gateway. */
962 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
963 *dst = ip6_route_output(net, sk, &fl_gw);
964 if ((err = (*dst)->error))
965 goto out_err_release;
973 if (err == -ENETUNREACH)
974 IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
981 * ip6_dst_lookup - perform route lookup on flow
982 * @sk: socket which provides route info
983 * @dst: pointer to dst_entry * for result
984 * @fl: flow to lookup
986 * This function performs a route lookup on the given flow.
988 * It returns zero on success, or a standard errno code on error.
990 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
993 return ip6_dst_lookup_tail(sk, dst, fl);
995 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
998 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
999 * @sk: socket which provides the dst cache and route info
1000 * @dst: pointer to dst_entry * for result
1001 * @fl: flow to lookup
1003 * This function performs a route lookup on the given flow with the
1004 * possibility of using the cached route in the socket if it is valid.
1005 * It will take the socket dst lock when operating on the dst cache.
1006 * As a result, this function can only be used in process context.
1008 * It returns zero on success, or a standard errno code on error.
1010 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
/* Try the cached route first; fall back to a fresh lookup. */
1014 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1015 *dst = ip6_sk_dst_check(sk, *dst, fl);
1018 return ip6_dst_lookup_tail(sk, dst, fl);
1020 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
/*
 * Append data for UDP fragmentation offload (UFO): build (or extend)
 * a single large skb whose payload lives in page fragments, tag it
 * with gso_size/gso_type so the device splits it into MTU-sized
 * fragments on transmit, and pre-assign the fragment ID the device
 * will use.  Returns non-zero to make the caller fall back to the
 * normal (software) fragmentation path.
 */
1022 static inline int ip6_ufo_append_data(struct sock *sk,
1023 int getfrag(void *from, char *to, int offset, int len,
1024 int odd, struct sk_buff *skb),
1025 void *from, int length, int hh_len, int fragheaderlen,
1026 int transhdrlen, int mtu,unsigned int flags)
1029 struct sk_buff *skb;
1032 /* There is support for UDP large send offload by network
1033 * device, so create one single skb packet containing complete
/* First call for this message: allocate the head skb. */
1036 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1037 skb = sock_alloc_send_skb(sk,
1038 hh_len + fragheaderlen + transhdrlen + 20,
1039 (flags & MSG_DONTWAIT), &err);
1043 /* reserve space for Hardware header */
1044 skb_reserve(skb, hh_len);
1046 /* create space for UDP/IP header */
1047 skb_put(skb,fragheaderlen + transhdrlen);
1049 /* initialize network header pointer */
1050 skb_reset_network_header(skb);
1052 /* initialize protocol header pointer */
1053 skb->transport_header = skb->network_header + fragheaderlen;
1055 skb->ip_summed = CHECKSUM_PARTIAL;
1057 sk->sk_sndmsg_off = 0;
/* Copy the user payload into page fragments attached to the skb. */
1060 err = skb_append_datato_frags(sk,skb, getfrag, from,
1061 (length - transhdrlen));
1063 struct frag_hdr fhdr;
1065 /* specify the length of each IP datagram fragment*/
1066 skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1067 sizeof(struct frag_hdr);
1068 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1069 ipv6_select_ident(skb, &fhdr);
1070 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1071 __skb_queue_tail(&sk->sk_write_queue, skb);
1075 /* There is not enough support do UPD LSO,
1076 * so follow normal path
/*
 * Queue data onto the socket's write queue as a chain of correctly
 * sized, fragmentation-ready skbs ("corking").  The first call for a
 * message snapshots the options, route, hop limit, traffic class, and
 * MTU into the cork state; subsequent calls reuse them.  Data is
 * copied via @getfrag either into skb linear space or into attached
 * page fragments (scatter-gather devices).  The packet is actually
 * built and sent later by ip6_push_pending_frames().
 *
 * Returns 0 on success or a negative errno; on error the queued
 * length is rolled back and OUTDISCARDS is counted.
 */
1083 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1084 int offset, int len, int odd, struct sk_buff *skb),
1085 void *from, int length, int transhdrlen,
1086 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1087 struct rt6_info *rt, unsigned int flags)
1089 struct inet_sock *inet = inet_sk(sk);
1090 struct ipv6_pinfo *np = inet6_sk(sk);
1091 struct sk_buff *skb;
1092 unsigned int maxfraglen, fragheaderlen;
1099 int csummode = CHECKSUM_NONE;
1101 if (flags&MSG_PROBE)
/* ---- First append for this message: set up the cork state ---- */
1103 if (skb_queue_empty(&sk->sk_write_queue)) {
/* Take a private copy of the tx options for the cork's lifetime. */
1108 if (np->cork.opt == NULL) {
1109 np->cork.opt = kmalloc(opt->tot_len,
1111 if (unlikely(np->cork.opt == NULL))
1113 } else if (np->cork.opt->tot_len < opt->tot_len) {
1114 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
1117 memcpy(np->cork.opt, opt, opt->tot_len);
1118 inet->cork.flags |= IPCORK_OPT;
1119 /* need source address above miyazawa*/
1121 dst_hold(&rt->u.dst);
1122 inet->cork.dst = &rt->u.dst;
1123 inet->cork.fl = *fl;
1124 np->cork.hop_limit = hlimit;
1125 np->cork.tclass = tclass;
/* Snapshot the MTU: raw device MTU when probing PMTU ourselves,
 * else the path MTU; a per-socket frag_size may shrink it. */
1126 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1127 rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1128 if (np->frag_size < mtu) {
1130 mtu = np->frag_size;
1132 inet->cork.fragsize = mtu;
1133 if (dst_allfrag(rt->u.dst.path))
1134 inet->cork.flags |= IPCORK_ALLFRAG;
1135 inet->cork.length = 0;
1136 sk->sk_sndmsg_page = NULL;
1137 sk->sk_sndmsg_off = 0;
1138 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1139 rt->rt6i_nfheader_len;
1140 length += exthdrlen;
1141 transhdrlen += exthdrlen;
/* ---- Subsequent append: reuse the corked route/flow/options ---- */
1143 rt = (struct rt6_info *)inet->cork.dst;
1144 fl = &inet->cork.fl;
1145 if (inet->cork.flags & IPCORK_OPT)
1149 mtu = inet->cork.fragsize;
1152 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1154 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1155 (opt ? opt->opt_nflen : 0);
/* Largest 8-byte-aligned fragment payload boundary. */
1156 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
/* Reject messages that would exceed the maximum IPv6 payload. */
1158 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1159 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1160 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1166 * Let's try using as much space as possible.
1167 * Use MTU if total length of the message fits into the MTU.
1168 * Otherwise, we need to reserve fragment header and
1169 * fragment alignment (= 8-15 octects, in total).
1171 * Note that we may need to "move" the data from the tail of
1172 * of the buffer to the new fragment when we split
1175 * FIXME: It may be fragmented into multiple chunks
1176 * at once if non-fragmentable extension headers
1181 inet->cork.length += length;
/* Large UDP sends on UFO-capable devices use the offload path. */
1182 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1183 (rt->u.dst.dev->features & NETIF_F_UFO)) {
1185 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1186 fragheaderlen, transhdrlen, mtu,
1193 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
/* ---- Main copy loop ---- */
1196 while (length > 0) {
1197 /* Check if the remaining data fits into current packet. */
1198 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1200 copy = maxfraglen - skb->len;
1204 unsigned int datalen;
1205 unsigned int fraglen;
1206 unsigned int fraggap;
1207 unsigned int alloclen;
1208 struct sk_buff *skb_prev;
1212 /* There's no room in the current skb */
/* Bytes past the fragment boundary that must migrate from the
 * previous skb into the new one to keep alignment. */
1214 fraggap = skb_prev->len - maxfraglen;
1219 * If remaining data exceeds the mtu,
1220 * we know we need more fragment(s).
1222 datalen = length + fraggap;
1223 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1224 datalen = maxfraglen - fragheaderlen;
1226 fraglen = datalen + fragheaderlen;
1227 if ((flags & MSG_MORE) &&
1228 !(rt->u.dst.dev->features&NETIF_F_SG))
1231 alloclen = datalen + fragheaderlen;
1234 * The last fragment gets additional space at tail.
1235 * Note: we overallocate on fragments with MSG_MODE
1236 * because we have no idea if we're the last one.
1238 if (datalen == length + fraggap)
1239 alloclen += rt->u.dst.trailer_len;
1242 * We just reserve space for fragment header.
1243 * Note: this may be overallocation if the message
1244 * (without MSG_MORE) fits into the MTU.
1246 alloclen += sizeof(struct frag_hdr);
1249 skb = sock_alloc_send_skb(sk,
1251 (flags & MSG_DONTWAIT), &err);
1254 if (atomic_read(&sk->sk_wmem_alloc) <=
1256 skb = sock_wmalloc(sk,
1257 alloclen + hh_len, 1,
1259 if (unlikely(skb == NULL))
1265 * Fill in the control structures
1267 skb->ip_summed = csummode;
1269 /* reserve for fragmentation */
1270 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1273 * Find where to start putting bytes
1275 data = skb_put(skb, fraglen);
1276 skb_set_network_header(skb, exthdrlen);
1277 data += fragheaderlen;
1278 skb->transport_header = (skb->network_header +
/* Move the overhanging tail bytes (and their checksum) from the
 * previous skb into this one. */
1281 skb->csum = skb_copy_and_csum_bits(
1282 skb_prev, maxfraglen,
1283 data + transhdrlen, fraggap, 0);
1284 skb_prev->csum = csum_sub(skb_prev->csum,
1287 pskb_trim_unique(skb_prev, maxfraglen);
1289 copy = datalen - transhdrlen - fraggap;
1294 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1301 length -= datalen - fraggap;
/* Transport header and hardware-checksum hint apply only to the
 * first skb of the message. */
1304 csummode = CHECKSUM_NONE;
1307 * Put the packet on the pending queue
1309 __skb_queue_tail(&sk->sk_write_queue, skb);
/* ---- Room left in the current skb ---- */
1316 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
/* No scatter-gather: copy straight into linear skb space. */
1320 if (getfrag(from, skb_put(skb, copy),
1321 offset, copy, off, skb) < 0) {
1322 __skb_trim(skb, off);
/* Scatter-gather: append into (possibly shared) page fragments. */
1327 int i = skb_shinfo(skb)->nr_frags;
1328 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1329 struct page *page = sk->sk_sndmsg_page;
1330 int off = sk->sk_sndmsg_off;
1333 if (page && (left = PAGE_SIZE - off) > 0) {
/* Cached page differs from the last frag's page: start a
 * new frag descriptor (if a slot remains). */
1336 if (page != frag->page) {
1337 if (i == MAX_SKB_FRAGS) {
1342 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1343 frag = &skb_shinfo(skb)->frags[i];
1345 } else if(i < MAX_SKB_FRAGS) {
1346 if (copy > PAGE_SIZE)
1348 page = alloc_pages(sk->sk_allocation, 0);
1353 sk->sk_sndmsg_page = page;
1354 sk->sk_sndmsg_off = 0;
1356 skb_fill_page_desc(skb, i, page, 0, 0);
1357 frag = &skb_shinfo(skb)->frags[i];
1362 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1366 sk->sk_sndmsg_off += copy;
1369 skb->data_len += copy;
1370 skb->truesize += copy;
1371 atomic_add(copy, &sk->sk_wmem_alloc);
/* Error path: undo the length accounting and count the discard. */
1378 inet->cork.length -= length;
1379 IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * Tear down the corking state after a message has been pushed or
 * flushed: free the copied options, drop the held route reference,
 * clear the cork flags, and wipe the saved flow.
 */
1383 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1385 inet->cork.flags &= ~IPCORK_OPT;
1386 kfree(np->cork.opt);
1387 np->cork.opt = NULL;
1388 if (inet->cork.dst) {
1389 dst_release(inet->cork.dst);
1390 inet->cork.dst = NULL;
1391 inet->cork.flags &= ~IPCORK_ALLFRAG;
1393 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * Assemble the skbs queued by ip6_append_data() into one packet:
 * splice every queued skb onto the head skb's frag_list, push the
 * corked extension headers and the IPv6 header, attach the corked
 * route, update ICMPv6 counters when applicable, and transmit via
 * ip6_local_out().  Always releases the cork state before returning.
 */
1396 int ip6_push_pending_frames(struct sock *sk)
1398 struct sk_buff *skb, *tmp_skb;
1399 struct sk_buff **tail_skb;
1400 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1401 struct inet_sock *inet = inet_sk(sk);
1402 struct ipv6_pinfo *np = inet6_sk(sk);
1403 struct ipv6hdr *hdr;
1404 struct ipv6_txoptions *opt = np->cork.opt;
1405 struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1406 struct flowi *fl = &inet->cork.fl;
1407 unsigned char proto = fl->proto;
1410 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1412 tail_skb = &(skb_shinfo(skb)->frag_list);
1414 /* move skb->data to ip header from ext header */
1415 if (skb->data < skb_network_header(skb))
1416 __skb_pull(skb, skb_network_offset(skb));
/* Chain the remaining queued skbs onto the head's frag_list,
 * folding their length/truesize into the head and detaching
 * their socket ownership (the head accounts for all of them). */
1417 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1418 __skb_pull(tmp_skb, skb_network_header_len(skb));
1419 *tail_skb = tmp_skb;
1420 tail_skb = &(tmp_skb->next);
1421 skb->len += tmp_skb->len;
1422 skb->data_len += tmp_skb->len;
1423 skb->truesize += tmp_skb->truesize;
1424 __sock_put(tmp_skb->sk);
1425 tmp_skb->destructor = NULL;
1429 /* Allow local fragmentation. */
1430 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1433 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1434 __skb_pull(skb, skb_network_header_len(skb));
/* Push corked extension headers; the non-fragmentable ones may
 * rewrite final_dst (routing header). */
1435 if (opt && opt->opt_flen)
1436 ipv6_push_frag_opts(skb, opt, &proto);
1437 if (opt && opt->opt_nflen)
1438 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1440 skb_push(skb, sizeof(struct ipv6hdr));
1441 skb_reset_network_header(skb);
1442 hdr = ipv6_hdr(skb);
/* Version 6, corked traffic class, and the flow label. */
1444 *(__be32*)hdr = fl->fl6_flowlabel |
1445 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1447 hdr->hop_limit = np->cork.hop_limit;
1448 hdr->nexthdr = proto;
1449 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1450 ipv6_addr_copy(&hdr->daddr, final_dst);
1452 skb->priority = sk->sk_priority;
1453 skb->mark = sk->sk_mark;
1455 skb->dst = dst_clone(&rt->u.dst);
1456 IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1457 if (proto == IPPROTO_ICMPV6) {
1458 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1460 ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1461 ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
1464 err = ip6_local_out(skb);
/* Positive xmit codes become errors only if the socket asked
 * for them via IPV6_RECVERR. */
1467 err = np->recverr ? net_xmit_errno(err) : 0;
1473 ip6_cork_release(inet, np);
/*
 * Abort a corked message: drop every skb still on the write queue,
 * counting each as an output discard, then release the cork state.
 */
1479 void ip6_flush_pending_frames(struct sock *sk)
1481 struct sk_buff *skb;
1483 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1485 IP6_INC_STATS(ip6_dst_idev(skb->dst),
1486 IPSTATS_MIB_OUTDISCARDS);
1490 ip6_cork_release(inet_sk(sk), inet6_sk(sk));