2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
// ip6_finish_output2 - last transmit step for an skb that already carries a
// dst: tag it as IPv6, apply the multicast special cases, then resolve the
// next-hop neighbour and pass the skb to dst_neigh_output().
//
// NOTE(review): this listing is lossy - the embedded original line numbers
// skip, so braces and some short statements (returns, labels) between the
// visible lines are missing. The notes below cover only the visible lines.
59 static int ip6_finish_output2(struct sk_buff *skb)
61 struct dst_entry *dst = skb_dst(skb);
62 struct net_device *dev = dst->dev;
63 struct neighbour *neigh;
64 struct in6_addr *nexthop;
// Mark the buffer as carrying IPv6 (ETH_P_IPV6 in network byte order).
67 skb->protocol = htons(ETH_P_IPV6);
// Multicast destination: decide whether a copy has to be looped back.
70 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
// Loop back only on a non-loopback device when sk_mc_loop(skb->sk) is true
// and either mroute6_socket() matches a packet not yet flagged
// IP6SKB_FORWARDED, or ipv6_chk_mcast_addr() accepts (daddr, saddr) on dev.
73 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
74 ((mroute6_socket(dev_net(dev), skb) &&
75 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 &ipv6_hdr(skb)->saddr))) {
78 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
80 /* Do not check for IFF_ALLMULTI; multicast routing
81 is not supported in any case.
// The clone, not the original skb, is what enters POST_ROUTING here.
84 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 newskb, NULL, newskb->dev,
// A multicast packet whose hop_limit is 0 is counted as an output discard.
88 if (ipv6_hdr(skb)->hop_limit == 0) {
89 IP6_INC_STATS(dev_net(dev), idev,
90 IPSTATS_MIB_OUTDISCARDS);
96 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
// Destination scope no wider than node-local on a non-loopback device;
// the action taken in this branch is not visible in this listing.
99 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
100 IPV6_ADDR_SCOPE_NODELOCAL &&
101 !(dev->flags & IFF_LOOPBACK)) {
// Look up the neighbour entry for the next hop; create one on a miss.
108 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
109 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
110 if (unlikely(!neigh))
111 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
112 if (!IS_ERR(neigh)) {
113 ret = dst_neigh_output(dst, neigh, skb);
114 rcu_read_unlock_bh();
117 rcu_read_unlock_bh();
// __neigh_create() returned an error pointer: count OUTNOROUTES.
119 IP6_INC_STATS_BH(dev_net(dst->dev),
120 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
// ip6_finish_output - pick the transmit path for one skb.
// The skb is handed to ip6_fragment(), with ip6_finish_output2 as the
// per-fragment output callback, when it is longer than ip6_skb_dst_mtu()
// and is not GSO, or when dst_allfrag() is set on its dst. Otherwise it
// goes straight to ip6_finish_output2(). The callee result is returned.
125 static int ip6_finish_output(struct sk_buff *skb)
127 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
128 dst_allfrag(skb_dst(skb)))
129 return ip6_fragment(skb, ip6_finish_output2);
131 return ip6_finish_output2(skb);
// ip6_output - output entry point for an skb with a dst attached.
// When idev->cnf.disable_ipv6 is set the packet is counted under
// OUTDISCARDS (what then happens to the skb is not visible in this
// listing). Otherwise the skb is passed to the NF_INET_POST_ROUTING hook
// via NF_HOOK_COND, with "IP6SKB_REROUTED not set" as the condition
// argument, and the hook result is returned.
134 int ip6_output(struct sk_buff *skb)
136 struct net_device *dev = skb_dst(skb)->dev;
137 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
138 if (unlikely(idev->cnf.disable_ipv6)) {
139 IP6_INC_STATS(dev_net(dev), idev,
140 IPSTATS_MIB_OUTDISCARDS);
145 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
147 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
151 * xmit an sk_buff (used by TCP, SCTP and DCCP)
// ip6_xmit - prepend extension headers and the IPv6 header to @skb and
// send it through the NF_INET_LOCAL_OUT hook with dst_output.
// @sk: sending socket; supplies sk_priority, sk_mark and np->hop_limit
// @skb: payload buffer; its dst is read with skb_dst() and must be set
// @fl6: flow; supplies saddr, daddr, flowlabel and the next-header protocol
// @opt: extension headers; opt_nflen and opt_flen size the extra headroom
// @tclass: traffic class passed to ip6_flow_hdr()
//
// If the finished packet is larger than mtu and neither skb->local_df nor
// GSO applies, the socket is told EMSGSIZE through ipv6_local_error() and
// FRAGFAILS is counted. NOTE(review): the guards around @opt, the mtu
// assignment and the return statements are not visible in this listing.
154 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
155 struct ipv6_txoptions *opt, int tclass)
157 struct net *net = sock_net(sk);
158 struct ipv6_pinfo *np = inet6_sk(sk);
159 struct in6_addr *first_hop = &fl6->daddr;
160 struct dst_entry *dst = skb_dst(skb);
162 u8 proto = fl6->flowi6_proto;
163 int seg_len = skb->len;
168 unsigned int head_room;
170 /* First: exthdrs may take lots of space (~8K for now)
171 MAX_HEADER is not enough.
// seg_len grows by the extension header bytes only; head_room then also
// adds the fixed IPv6 header and the link-layer reserve.
173 head_room = opt->opt_nflen + opt->opt_flen;
174 seg_len += head_room;
175 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
// Too little headroom: reallocate. The OUTDISCARDS bump below sits in
// this branch (its exact condition is not visible here).
177 if (skb_headroom(skb) < head_room) {
178 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
180 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
181 IPSTATS_MIB_OUTDISCARDS);
187 skb_set_owner_w(skb, sk);
// ipv6_push_nfrag_opts() receives &first_hop, so it may change which
// address ends up in hdr->daddr below.
190 ipv6_push_frag_opts(skb, opt, &proto);
192 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
195 skb_push(skb, sizeof(struct ipv6hdr));
196 skb_reset_network_header(skb);
200 * Fill in the IPv6 header
// Hop limit comes from the socket or, via ip6_dst_hoplimit(), from the dst.
203 hlimit = np->hop_limit;
205 hlimit = ip6_dst_hoplimit(dst);
207 ip6_flow_hdr(hdr, tclass, fl6->flowlabel);
209 hdr->payload_len = htons(seg_len);
210 hdr->nexthdr = proto;
211 hdr->hop_limit = hlimit;
213 hdr->saddr = fl6->saddr;
214 hdr->daddr = *first_hop;
216 skb->priority = sk->sk_priority;
217 skb->mark = sk->sk_mark;
// Send when the packet fits, or when local_df or GSO lifts the size limit.
220 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
221 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
222 IPSTATS_MIB_OUT, skb->len);
223 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
224 dst->dev, dst_output);
// Too big to send: report EMSGSIZE with the mtu to the socket.
228 ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
229 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
234 EXPORT_SYMBOL(ip6_xmit);
// ip6_call_ra_chain - offer a Router Alert packet to the sockets on
// ip6_ra_chain.
// @skb: the packet; skb->dev->ifindex is compared with each socket binding
// @sel: selector value; only entries with ra->sel == sel are considered
//
// The chain is walked under read_lock(&ip6_ra_lock). An entry matches when
// it has a socket, its sel equals @sel, and the socket is either unbound
// or bound to the receiving device. Delivery is via rawv6_rcv(): clones go
// to it inside the loop and the original skb after the loop.
// NOTE(review): the updates of "last" and the return statements are not
// visible here; ip6_forward() tests the result as a boolean.
236 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
238 struct ip6_ra_chain *ra;
239 struct sock *last = NULL;
241 read_lock(&ip6_ra_lock);
242 for (ra = ip6_ra_chain; ra; ra = ra->next) {
243 struct sock *sk = ra->sk;
244 if (sk && ra->sel == sel &&
245 (!sk->sk_bound_dev_if ||
246 sk->sk_bound_dev_if == skb->dev->ifindex)) {
248 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
250 rawv6_rcv(last, skb2);
257 rawv6_rcv(last, skb);
258 read_unlock(&ip6_ra_lock);
261 read_unlock(&ip6_ra_lock);
// ip6_forward_proxy_check - classify a packet addressed to a proxied
// neighbour before it is forwarded.
// The upper-layer protocol is found by skipping extension headers. For
// ICMPv6 the message type is examined and the four neighbour discovery
// types (router/neighbour solicitation and advertisement) are singled out.
// A link-local destination triggers dst_link_failure().
// NOTE(review): the return statements are not visible in this listing.
// ip6_forward() passes the skb to ip6_input() on one outcome and counts
// INDISCARDS when the result is negative.
265 static int ip6_forward_proxy_check(struct sk_buff *skb)
267 struct ipv6hdr *hdr = ipv6_hdr(skb);
268 u8 nexthdr = hdr->nexthdr;
// With extension headers present, offset is wherever ipv6_skip_exthdr()
// stops; without them it is simply the fixed header size.
272 if (ipv6_ext_hdr(nexthdr)) {
273 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
277 offset = sizeof(struct ipv6hdr);
279 if (nexthdr == IPPROTO_ICMPV6) {
280 struct icmp6hdr *icmp6;
// Make sure one byte past offset (the ICMPv6 type) is in the linear area.
282 if (!pskb_may_pull(skb, (skb_network_header(skb) +
283 offset + 1 - skb->data)))
286 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
288 switch (icmp6->icmp6_type) {
289 case NDISC_ROUTER_SOLICITATION:
290 case NDISC_ROUTER_ADVERTISEMENT:
291 case NDISC_NEIGHBOUR_SOLICITATION:
292 case NDISC_NEIGHBOUR_ADVERTISEMENT:
294 /* For reaction involving unicast neighbor discovery
295 * message destined to the proxied address, pass it to
305 * The proxying router can't forward traffic sent to a link-local
306 * address, so signal the sender and discard the packet. This
307 * behavior is clarified by the MIPv6 specification.
309 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
310 dst_link_failure(skb);
// ip6_forward_finish - completion step of forwarding: pass the skb to
// dst_output() and return its result unchanged.
317 static inline int ip6_forward_finish(struct sk_buff *skb)
319 return dst_output(skb);
// ip6_forward - forward a received IPv6 packet along its attached dst.
//
// Visible checks, in order:
//   - devconf_all->forwarding must be non-zero
//   - skb_warn_if_lro() and the XFRM forward policy must pass
//   - skb->pkt_type must be PACKET_HOST
//   - Router Alert packets are first offered to ip6_call_ra_chain()
//   - hop_limit <= 1 sends ICMPV6_TIME_EXCEED and counts INHDRERRORS
//   - proxy_ndp with a pneigh entry consults ip6_forward_proxy_check()
//   - xfrm6_route_forward() must pass
//   - same in/out device, no source route, no sec path: send a redirect,
//     rate limited through inet_peer_xrlim_allow(peer, 1*HZ)
//   - otherwise the source address type is validated
//   - oversized packets get ICMPV6_PKT_TOOBIG
//   - skb_cow() secures writable headroom for the link-layer header
// On success the packet is counted (OUTFORWDATAGRAMS, OUTOCTETS) and sent
// through the NF_INET_FORWARD hook.
// NOTE(review): goto targets, labels, the mtu assignment, the hop limit
// decrement and the drop path are not visible in this listing.
322 int ip6_forward(struct sk_buff *skb)
324 struct dst_entry *dst = skb_dst(skb);
325 struct ipv6hdr *hdr = ipv6_hdr(skb);
326 struct inet6_skb_parm *opt = IP6CB(skb);
327 struct net *net = dev_net(dst->dev);
330 if (net->ipv6.devconf_all->forwarding == 0)
333 if (skb_warn_if_lro(skb))
336 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
337 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
341 if (skb->pkt_type != PACKET_HOST)
344 skb_forward_csum(skb);
347 * We DO NOT make any processing on
348 * RA packets, pushing them to user level AS IS
349 * without ane WARRANTY that application will be able
350 * to interpret them. The reason is that we
351 * cannot make anything clever here.
353 * We are not end-node, so that if packet contains
354 * AH/ESP, we cannot make anything.
355 * Defragmentation also would be mistake, RA packets
356 * cannot be fragmented, because there is no warranty
357 * that different fragments will go along one path. --ANK
// opt->ra is converted from network order before use as the selector.
359 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
360 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
365 * check and decrement ttl
367 if (hdr->hop_limit <= 1) {
368 /* Force OUTPUT device used as source address */
370 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
371 IP6_INC_STATS_BH(net,
372 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
378 /* XXX: idev->cnf.proxy_ndp? */
379 if (net->ipv6.devconf_all->proxy_ndp &&
380 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
381 int proxied = ip6_forward_proxy_check(skb);
383 return ip6_input(skb);
384 else if (proxied < 0) {
385 IP6_INC_STATS(net, ip6_dst_idev(dst),
386 IPSTATS_MIB_INDISCARDS);
391 if (!xfrm6_route_forward(skb)) {
392 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
397 /* IPv6 specs say nothing about it, but it is clear that we cannot
398 send redirects to source routed frames.
399 We don't send redirects to frames decapsulated from IPsec.
401 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
402 struct in6_addr *target = NULL;
403 struct inet_peer *peer;
407 * incoming and outgoing devices are the same
411 rt = (struct rt6_info *) dst;
412 if (rt->rt6i_flags & RTF_GATEWAY)
413 target = &rt->rt6i_gateway;
415 target = &hdr->daddr;
417 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
419 /* Limit redirects both by destination (here)
420 and by source (inside ndisc_send_redirect)
422 if (inet_peer_xrlim_allow(peer, 1*HZ))
423 ndisc_send_redirect(skb, target);
427 int addrtype = ipv6_addr_type(&hdr->saddr);
429 /* This check is security critical. */
430 if (addrtype == IPV6_ADDR_ANY ||
431 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
433 if (addrtype & IPV6_ADDR_LINKLOCAL) {
434 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
435 ICMPV6_NOT_NEIGHBOUR, 0);
// mtu is never allowed below the IPv6 minimum (IPV6_MIN_MTU).
441 if (mtu < IPV6_MIN_MTU)
// Too big when the packet itself exceeds mtu (no local_df, not GSO) or
// when the largest original fragment recorded in frag_max_size does.
444 if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
445 (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
446 /* Again, force OUTPUT device used as source address */
448 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
449 IP6_INC_STATS_BH(net,
450 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
451 IP6_INC_STATS_BH(net,
452 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
// A non-zero skb_cow() result is counted under OUTDISCARDS.
457 if (skb_cow(skb, dst->dev->hard_header_len)) {
458 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
464 /* Mangling hops number delayed to point after skb COW */
468 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
469 IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
470 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
474 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
// ip6_copy_metadata - make fragment @to carry the same per-packet metadata
// as @from.
// Copies pkt_type, priority, protocol and mark, attaches a new reference
// to the dst of @from (dst_clone), and copies the security mark. tc_index
// is copied only with CONFIG_NET_SCHED and nf_trace only when the
// netfilter TRACE target is enabled.
480 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
482 to->pkt_type = from->pkt_type;
483 to->priority = from->priority;
484 to->protocol = from->protocol;
486 skb_dst_set(to, dst_clone(skb_dst(from)));
488 to->mark = from->mark;
490 #ifdef CONFIG_NET_SCHED
491 to->tc_index = from->tc_index;
494 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
495 to->nf_trace = from->nf_trace;
497 skb_copy_secmark(to, from);
// ip6_fragment - split @skb into IPv6 fragments.
// @skb: packet to fragment; its dst supplies the mtu and the device
// @output: callback taking one skb (its call sites are not visible here)
//
// hlen is the length of the part that precedes the fragment header, as
// reported by ip6_find_1stfragopt(); prevhdr points at the next-header
// byte that gets overwritten with NEXTHDR_FRAGMENT.
//
// Two strategies are visible:
//   - fast path: the skb already has a frag list whose pieces satisfy the
//     geometry checks (size within mtu, 8-byte multiples except the last,
//     enough headroom, not shared). Headers are pushed in place.
//   - slow path: new skbs are allocated with alloc_skb() and the header
//     and payload are copied into each.
// Packets that may not be fragmented (no local_df and longer than mtu, or
// frag_max_size above mtu) get ICMPV6_PKT_TOOBIG and count FRAGFAILS.
// NOTE(review): loop headers, labels, output() calls, frees and returns
// are not visible in this listing.
500 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
502 struct sk_buff *frag;
503 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
504 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
505 struct ipv6hdr *tmp_hdr;
507 unsigned int mtu, hlen, left, len;
510 int ptr, offset = 0, err=0;
511 u8 *prevhdr, nexthdr = 0;
512 struct net *net = dev_net(skb_dst(skb)->dev);
514 hlen = ip6_find_1stfragopt(skb, &prevhdr);
517 mtu = ip6_skb_dst_mtu(skb);
519 /* We must not fragment if the socket is set to force MTU discovery
520 * or if the skb it not generated by a local socket.
522 if (unlikely(!skb->local_df && skb->len > mtu) ||
523 (IP6CB(skb)->frag_max_size &&
524 IP6CB(skb)->frag_max_size > mtu)) {
525 if (skb->sk && dst_allfrag(skb_dst(skb)))
526 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
528 skb->dev = skb_dst(skb)->dev;
529 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
530 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
531 IPSTATS_MIB_FRAGFAILS);
536 if (np && np->frag_size < mtu) {
// From here on mtu means payload room per fragment: the unfragmentable
// headers and the fragment header are subtracted.
540 mtu -= hlen + sizeof(struct frag_hdr);
542 if (skb_has_frag_list(skb)) {
543 int first_len = skb_pagelen(skb);
544 struct sk_buff *frag2;
546 if (first_len - hlen > mtu ||
547 ((first_len - hlen) & 7) ||
551 skb_walk_frags(skb, frag) {
552 /* Correct geometry. */
553 if (frag->len > mtu ||
554 ((frag->len & 7) && frag->next) ||
555 skb_headroom(frag) < hlen)
556 goto slow_path_clean;
558 /* Partially cloned skb? */
559 if (skb_shared(frag))
560 goto slow_path_clean;
// Each piece takes sock_wfree as destructor and its truesize is removed
// from the head skb; the slow_path_clean code further down undoes both.
565 frag->destructor = sock_wfree;
567 skb->truesize -= frag->truesize;
572 frag = skb_shinfo(skb)->frag_list;
573 skb_frag_list_init(skb);
// Keep a private copy of the first hlen bytes so the same headers can be
// stamped onto every following fragment.
576 *prevhdr = NEXTHDR_FRAGMENT;
577 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
579 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
580 IPSTATS_MIB_FRAGFAILS);
// Open a gap for the fragment header between the headers and the payload.
584 __skb_pull(skb, hlen);
585 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
586 __skb_push(skb, hlen);
587 skb_reset_network_header(skb);
588 memcpy(skb_network_header(skb), tmp_hdr, hlen);
590 ipv6_select_ident(fh, rt);
591 fh->nexthdr = nexthdr;
593 fh->frag_off = htons(IP6_MF);
594 frag_id = fh->identification;
596 first_len = skb_pagelen(skb);
597 skb->data_len = first_len - skb_headlen(skb);
598 skb->len = first_len;
599 ipv6_hdr(skb)->payload_len = htons(first_len -
600 sizeof(struct ipv6hdr));
605 /* Prepare header of the next frame,
606 * before previous one went down. */
608 frag->ip_summed = CHECKSUM_NONE;
609 skb_reset_transport_header(frag);
610 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
611 __skb_push(frag, hlen);
612 skb_reset_network_header(frag);
613 memcpy(skb_network_header(frag), tmp_hdr,
// offset advances by the payload bytes of the fragment just prepared.
615 offset += skb->len - hlen - sizeof(struct frag_hdr);
616 fh->nexthdr = nexthdr;
618 fh->frag_off = htons(offset);
619 if (frag->next != NULL)
620 fh->frag_off |= htons(IP6_MF);
621 fh->identification = frag_id;
622 ipv6_hdr(frag)->payload_len =
624 sizeof(struct ipv6hdr));
625 ip6_copy_metadata(frag, skb);
630 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
631 IPSTATS_MIB_FRAGCREATES);
644 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
645 IPSTATS_MIB_FRAGOKS);
656 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
657 IPSTATS_MIB_FRAGFAILS);
// Undo the destructor and truesize changes made by the geometry walk.
662 skb_walk_frags(skb, frag2) {
666 frag2->destructor = NULL;
667 skb->truesize += frag2->truesize;
// The copying path needs a finished checksum, so a pending
// CHECKSUM_PARTIAL is resolved with skb_checksum_help() first.
672 if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
673 skb_checksum_help(skb))
676 left = skb->len - hlen; /* Space per frame */
677 ptr = hlen; /* Where to start from */
680 * Fragment the datagram.
683 *prevhdr = NEXTHDR_FRAGMENT;
684 hroom = LL_RESERVED_SPACE(rt->dst.dev);
685 troom = rt->dst.dev->needed_tailroom;
688 * Keep copying data until we run out.
692 /* IF: it doesn't fit, use 'mtu' - the data space left */
695 /* IF: we are not sending up to and including the packet end
696 then align the next start on an eight byte boundary */
704 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
705 hroom + troom, GFP_ATOMIC)) == NULL) {
706 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
707 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
708 IPSTATS_MIB_FRAGFAILS);
714 * Set up data on packet
717 ip6_copy_metadata(frag, skb);
718 skb_reserve(frag, hroom);
719 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
720 skb_reset_network_header(frag);
721 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
722 frag->transport_header = (frag->network_header + hlen +
723 sizeof(struct frag_hdr));
726 * Charge the memory for the fragment to any owner
730 skb_set_owner_w(frag, skb->sk);
733 * Copy the packet header into the new buffer.
735 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
738 * Build fragment header.
740 fh->nexthdr = nexthdr;
743 ipv6_select_ident(fh, rt);
744 frag_id = fh->identification;
746 fh->identification = frag_id;
749 * Copy a block of the IP datagram.
751 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
755 fh->frag_off = htons(offset);
757 fh->frag_off |= htons(IP6_MF);
758 ipv6_hdr(frag)->payload_len = htons(frag->len -
759 sizeof(struct ipv6hdr));
765 * Put this fragment into the sending queue.
771 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
772 IPSTATS_MIB_FRAGCREATES);
774 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
775 IPSTATS_MIB_FRAGOKS);
780 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
781 IPSTATS_MIB_FRAGFAILS);
// ip6_rt_check - test whether a cached route key fails to cover an address.
// @rt_key: key of the cached route (prefix length and address)
// @fl_addr: address taken from the current flow
// @addr_cache: last address used with this route, or NULL
//
// Returns non-zero (route must not be reused) when BOTH hold:
//   - rt_key is not a /128 entry equal to fl_addr, and
//   - addr_cache is NULL or differs from fl_addr.
// Returns 0 when either the host-route match or the cached address
// vouches for fl_addr.
786 static inline int ip6_rt_check(const struct rt6key *rt_key,
787 const struct in6_addr *fl_addr,
788 const struct in6_addr *addr_cache)
790 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
791 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
// ip6_sk_dst_check - validate a socket-cached dst against the current flow.
// @sk: socket; np->daddr_cache and np->saddr_cache are consulted
// @dst: cached dst candidate
// @fl6: flow about to be routed
//
// The dst is questioned when its family is not AF_INET6, when
// ip6_rt_check() rejects the destination key (and, with
// CONFIG_IPV6_SUBTREES, the source key), or when the flow names an output
// interface other than the device of the dst.
// NOTE(review): what is returned in each case is not visible in this
// listing; the caller assigns the result back to its dst pointer.
794 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
795 struct dst_entry *dst,
796 const struct flowi6 *fl6)
798 struct ipv6_pinfo *np = inet6_sk(sk);
804 if (dst->ops->family != AF_INET6) {
809 rt = (struct rt6_info *)dst;
810 /* Yes, checking route validity in not connected
811 * case is not very simple. Take into account,
812 * that we do not support routing by source, TOS,
813 * and MSG_DONTROUTE --ANK (980726)
815 * 1. ip6_rt_check(): If route was host route,
816 * check that cached destination is current.
817 * If it is network route, we still may
818 * check its validity using saved pointer
819 * to the last used address: daddr_cache.
820 * We do not want to save whole address now,
821 * (because main consumer of this service
822 * is tcp, which has not this problem),
823 * so that the last trick works only on connected
825 * 2. oif also should be the same.
827 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
828 #ifdef CONFIG_IPV6_SUBTREES
829 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
831 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
// ip6_dst_lookup_tail - shared back end of the ip6_dst_lookup* functions.
// @sk: socket whose namespace is used (sock_net) and whose srcprefs feed
//      source address selection; the "sk ?" test below shows it is
//      treated as optional there
// @dst: in/out dst pointer; filled from ip6_route_output()
// @fl6: flow; saddr is filled in when it is the unspecified address
//
// Steps visible here: route lookup, error check on the dst, source address
// selection through ip6_route_get_saddr(), and, under
// CONFIG_IPV6_OPTIMISTIC_DAD, a second lookup with daddr zeroed when the
// next-hop neighbour exists but is not NUD_VALID and the source address
// is flagged IFA_F_OPTIMISTIC. An -ENETUNREACH result counts OUTNOROUTES.
// NOTE(review): labels, returns and the release of the dst on error are
// not visible in this listing.
840 static int ip6_dst_lookup_tail(struct sock *sk,
841 struct dst_entry **dst, struct flowi6 *fl6)
843 struct net *net = sock_net(sk);
844 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
851 *dst = ip6_route_output(net, sk, fl6);
// The lookup reports failure through the error field of the dst.
853 if ((err = (*dst)->error))
854 goto out_err_release;
// No source address chosen yet: let the route pick one.
856 if (ipv6_addr_any(&fl6->saddr)) {
857 struct rt6_info *rt = (struct rt6_info *) *dst;
858 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
859 sk ? inet6_sk(sk)->srcprefs : 0,
862 goto out_err_release;
865 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
867 * Here if the dst entry we've looked up
868 * has a neighbour entry that is in the INCOMPLETE
869 * state and the src address from the flow is
870 * marked as OPTIMISTIC, we release the found
871 * dst entry and replace it instead with the
872 * dst entry of the nexthop router
874 rt = (struct rt6_info *) *dst;
876 n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr));
877 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
878 rcu_read_unlock_bh();
881 struct inet6_ifaddr *ifp;
882 struct flowi6 fl_gw6;
885 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
888 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
894 * We need to get the dst entry for the
895 * default router instead
898 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
899 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
900 *dst = ip6_route_output(net, sk, &fl_gw6);
901 if ((err = (*dst)->error))
902 goto out_err_release;
910 if (err == -ENETUNREACH)
911 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
918 * ip6_dst_lookup - perform route lookup on flow
919 * @sk: socket which provides route info
920 * @dst: pointer to dst_entry * for result
921 * @fl6: flow to lookup
923 * This function performs a route lookup on the given flow.
925 * It returns zero on success, or a standard errno code on error.
// ip6_dst_lookup - exported route lookup for a flow.
// Forwards @sk, @dst and @fl6 to ip6_dst_lookup_tail() and returns its
// result. Exported GPL-only.
927 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
930 return ip6_dst_lookup_tail(sk, dst, fl6);
932 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
935 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
936 * @sk: socket which provides route info
937 * @fl6: flow to lookup
938 * @final_dst: final destination address for ipsec lookup
939 * @can_sleep: we are in a sleepable context
941 * This function performs a route lookup on the given flow.
943 * It returns a valid dst pointer on success, or a pointer encoded
// ip6_dst_lookup_flow - route lookup followed by an XFRM (IPsec) lookup.
// Starts from a NULL dst, resolves it with ip6_dst_lookup_tail(), then
// can overwrite fl6->daddr with *final_dst and set FLOWI_FLAG_CAN_SLEEP
// before handing the dst to xfrm_lookup(), whose result is returned.
// NOTE(review): the last parameter line, the error check on err and the
// guards around the two fl6 updates are not visible in this listing.
946 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
947 const struct in6_addr *final_dst,
950 struct dst_entry *dst = NULL;
953 err = ip6_dst_lookup_tail(sk, &dst, fl6);
957 fl6->daddr = *final_dst;
959 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
961 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
963 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
966 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
967 * @sk: socket which provides the dst cache and route info
968 * @fl6: flow to lookup
969 * @final_dst: final destination address for ipsec lookup
970 * @can_sleep: we are in a sleepable context
972 * This function performs a route lookup on the given flow with the
973 * possibility of using the cached route in the socket if it is valid.
974 * It will take the socket dst lock when operating on the dst cache.
975 * As a result, this function can only be used in process context.
977 * It returns a valid dst pointer on success, or a pointer encoded
// ip6_sk_dst_lookup_flow - like ip6_dst_lookup_flow(), but starts from
// the dst cached on the socket.
// The cached dst is fetched with sk_dst_check() using the dst_cookie of
// the socket, vetted by ip6_sk_dst_check(), and then passed through
// ip6_dst_lookup_tail(). The XFRM lookup at the end matches
// ip6_dst_lookup_flow().
// NOTE(review): the last parameter line, the error check on err and the
// guards around the two fl6 updates are not visible in this listing.
980 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
981 const struct in6_addr *final_dst,
984 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
987 dst = ip6_sk_dst_check(sk, dst, fl6);
989 err = ip6_dst_lookup_tail(sk, &dst, fl6);
993 fl6->daddr = *final_dst;
995 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
997 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
999 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
// ip6_ufo_append_data - append data for a device that does UDP
// fragmentation offload, keeping the whole datagram in one skb.
// @getfrag: callback that copies user data into the skb
// @length: bytes to append, transport header included
// @hh_len: link-layer headroom to reserve
// @fragheaderlen: IPv6 header plus non-fragmentable extension headers
// @transhdrlen: transport header length
// @mtu: path mtu used to derive the GSO segment size
//
// When the write queue is empty a new skb is allocated and its header
// area laid out; the data then goes in as page fragments through
// skb_append_datato_frags(). gso_size is the largest multiple of 8 that
// fits in mtu after the headers and the fragment header, and the fragment
// id is chosen up front and stored in ip6_frag_id.
// NOTE(review): error checks and return statements are not visible here.
1001 static inline int ip6_ufo_append_data(struct sock *sk,
1002 int getfrag(void *from, char *to, int offset, int len,
1003 int odd, struct sk_buff *skb),
1004 void *from, int length, int hh_len, int fragheaderlen,
1005 int transhdrlen, int mtu,unsigned int flags,
1006 struct rt6_info *rt)
1009 struct sk_buff *skb;
1012 /* There is support for UDP large send offload by network
1013 * device, so create one single skb packet containing complete
1016 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1017 skb = sock_alloc_send_skb(sk,
1018 hh_len + fragheaderlen + transhdrlen + 20,
1019 (flags & MSG_DONTWAIT), &err);
1023 /* reserve space for Hardware header */
1024 skb_reserve(skb, hh_len);
1026 /* create space for UDP/IP header */
1027 skb_put(skb,fragheaderlen + transhdrlen);
1029 /* initialize network header pointer */
1030 skb_reset_network_header(skb);
1032 /* initialize protocol header pointer */
1033 skb->transport_header = skb->network_header + fragheaderlen;
1035 skb->ip_summed = CHECKSUM_PARTIAL;
1039 err = skb_append_datato_frags(sk,skb, getfrag, from,
1040 (length - transhdrlen));
1042 struct frag_hdr fhdr;
1044 /* Specify the length of each IPv6 datagram fragment.
1045 * It has to be a multiple of 8.
1047 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1048 sizeof(struct frag_hdr)) & ~7;
1049 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1050 ipv6_select_ident(&fhdr, rt);
1051 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1052 __skb_queue_tail(&sk->sk_write_queue, skb);
1056 /* There is not enough support do UPD LSO,
1057 * so follow normal path
// ip6_opt_dup - duplicate an IPv6 option header.
// Returns NULL for a NULL @src; otherwise a kmemdup() copy of
// (src->hdrlen + 1) * 8 bytes allocated with the gfp flags, which may
// itself be NULL on allocation failure.
// NOTE(review): the gfp parameter line is not visible in this listing.
1064 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1067 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
// ip6_rthdr_dup - duplicate an IPv6 routing header.
// Returns NULL for a NULL @src; otherwise a kmemdup() copy of
// (src->hdrlen + 1) * 8 bytes allocated with the gfp flags, which may
// itself be NULL on allocation failure.
// NOTE(review): the gfp parameter line is not visible in this listing.
1070 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1073 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
// ip6_append_data_mtu - recompute *mtu and *maxfraglen while appending.
// Applies only when the route is not an XFRM tunnel (DST_XFRM_TUNNEL
// clear). Per the in-line remarks, the first fragment gives up
// rt->dst.header_len bytes of mtu, while later fragments use the smaller
// of the current mtu and a device or path mtu selected by pmtuprobe.
// *maxfraglen is then the largest length whose payload after
// fragheaderlen is a multiple of 8, minus room for the fragment header.
// NOTE(review): some parameter lines and the branch conditions are not
// visible in this listing.
1076 static void ip6_append_data_mtu(unsigned int *mtu,
1078 unsigned int fragheaderlen,
1079 struct sk_buff *skb,
1080 struct rt6_info *rt,
1083 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1085 /* first fragment, reserve header_len */
1086 *mtu = *mtu - rt->dst.header_len;
1090 * this fragment is not first, the headers
1091 * space is regarded as data space.
1093 *mtu = min(*mtu, pmtuprobe ?
1095 dst_mtu(rt->dst.path));
1097 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1098 + fragheaderlen - sizeof(struct frag_hdr);
// ip6_append_data - queue user data on sk->sk_write_queue as one or more
// skbs sized so that they can later be sent as IPv6 fragments.
// @sk: socket owning the write queue and the cork state
// @getfrag: callback that copies (and may checksum) user data into an skb
// @from: opaque source handle passed to @getfrag
// @length: payload bytes to append
// @transhdrlen: transport header bytes to leave room for
// @hlimit, @tclass: stored in the cork for the header built later
// @opt: extension headers; deep-copied into np->cork.opt on first call
// @fl6, @rt: flow and route; stored in the cork on first call
// @flags: MSG_* flags (MSG_PROBE, MSG_MORE, MSG_DONTWAIT are tested)
// @dontfrag: reports the path mtu via ipv6_local_rxpmtu() for UDP and RAW
//            sockets; its full condition is not visible here
//
// First call on an empty queue sets up the cork (options, dst, flow, mtu,
// ALLFRAG flag); later calls reuse the corked values. Data is then copied
// either into the tail skb, into a newly allocated skb, or into page
// fragments when the device supports scatter-gather.
// On the error path cork->length is rolled back by @length and
// OUTDISCARDS is counted.
// NOTE(review): many guards, labels and return statements are not visible
// in this listing.
1102 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1103 int offset, int len, int odd, struct sk_buff *skb),
1104 void *from, int length, int transhdrlen,
1105 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1106 struct rt6_info *rt, unsigned int flags, int dontfrag)
1108 struct inet_sock *inet = inet_sk(sk);
1109 struct ipv6_pinfo *np = inet6_sk(sk);
1110 struct inet_cork *cork;
1111 struct sk_buff *skb, *skb_prev = NULL;
1112 unsigned int maxfraglen, fragheaderlen, mtu;
1121 if (flags&MSG_PROBE)
1123 cork = &inet->cork.base;
// Empty queue means a fresh cork: capture options, route and flow.
1124 if (skb_queue_empty(&sk->sk_write_queue)) {
1129 if (WARN_ON(np->cork.opt))
// The option block and each header it points at are copied, so the cork
// does not depend on the lifetime of the caller-supplied opt.
1132 np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
1133 if (unlikely(np->cork.opt == NULL))
1136 np->cork.opt->tot_len = opt->tot_len;
1137 np->cork.opt->opt_flen = opt->opt_flen;
1138 np->cork.opt->opt_nflen = opt->opt_nflen;
1140 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1142 if (opt->dst0opt && !np->cork.opt->dst0opt)
1145 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1147 if (opt->dst1opt && !np->cork.opt->dst1opt)
1150 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1152 if (opt->hopopt && !np->cork.opt->hopopt)
1155 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1157 if (opt->srcrt && !np->cork.opt->srcrt)
1160 /* need source address above miyazawa*/
1163 cork->dst = &rt->dst;
1164 inet->cork.fl.u.ip6 = *fl6;
1165 np->cork.hop_limit = hlimit;
1166 np->cork.tclass = tclass;
// mtu source: the device mtu when probing (IPV6_PMTUDISC_PROBE), else
// the dst mtu; XFRM tunnels use the dst itself, others use dst.path.
1167 if (rt->dst.flags & DST_XFRM_TUNNEL)
1168 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1169 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1171 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1172 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1173 if (np->frag_size < mtu) {
1175 mtu = np->frag_size;
1177 cork->fragsize = mtu;
1178 if (dst_allfrag(rt->dst.path))
1179 cork->flags |= IPCORK_ALLFRAG;
// Fragmentable extension headers are accounted as part of the data.
1181 exthdrlen = (opt ? opt->opt_flen : 0);
1182 length += exthdrlen;
1183 transhdrlen += exthdrlen;
1184 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
// Queue not empty: continue with the route, flow and mtu already corked.
1186 rt = (struct rt6_info *)cork->dst;
1187 fl6 = &inet->cork.fl.u.ip6;
1192 mtu = cork->fragsize;
1195 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1197 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1198 (opt ? opt->opt_nflen : 0);
1199 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
// Total queued length must stay within the maximum IPv6 payload.
1201 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1202 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1203 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1208 /* For UDP, check if TX timestamp is enabled */
1209 if (sk->sk_type == SOCK_DGRAM)
1210 sock_tx_timestamp(sk, &tx_flags);
1213 * Let's try using as much space as possible.
1214 * Use MTU if total length of the message fits into the MTU.
1215 * Otherwise, we need to reserve fragment header and
1216 * fragment alignment (= 8-15 octects, in total).
1218 * Note that we may need to "move" the data from the tail of
1219 * of the buffer to the new fragment when we split
1222 * FIXME: It may be fragmented into multiple chunks
1223 * at once if non-fragmentable extension headers
1228 cork->length += length;
1230 int proto = sk->sk_protocol;
1231 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1232 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1236 if (proto == IPPROTO_UDP &&
1237 (rt->dst.dev->features & NETIF_F_UFO)) {
1239 err = ip6_ufo_append_data(sk, getfrag, from, length,
1240 hh_len, fragheaderlen,
1241 transhdrlen, mtu, flags, rt);
1248 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1251 while (length > 0) {
1252 /* Check if the remaining data fits into current packet. */
1253 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1255 copy = maxfraglen - skb->len;
1259 unsigned int datalen;
1260 unsigned int fraglen;
1261 unsigned int fraggap;
1262 unsigned int alloclen;
1264 /* There's no room in the current skb */
1266 fraggap = skb->len - maxfraglen;
1269 /* update mtu and maxfraglen if necessary */
1270 if (skb == NULL || skb_prev == NULL)
1271 ip6_append_data_mtu(&mtu, &maxfraglen,
1272 fragheaderlen, skb, rt,
1274 IPV6_PMTUDISC_PROBE);
1279 * If remaining data exceeds the mtu,
1280 * we know we need more fragment(s).
1282 datalen = length + fraggap;
1284 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1285 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1286 if ((flags & MSG_MORE) &&
1287 !(rt->dst.dev->features&NETIF_F_SG))
1290 alloclen = datalen + fragheaderlen;
1292 alloclen += dst_exthdrlen;
1294 if (datalen != length + fraggap) {
1296 * this is not the last fragment, the trailer
1297 * space is regarded as data space.
1299 datalen += rt->dst.trailer_len;
1302 alloclen += rt->dst.trailer_len;
1303 fraglen = datalen + fragheaderlen;
1306 * We just reserve space for fragment header.
1307 * Note: this may be overallocation if the message
1308 * (without MSG_MORE) fits into the MTU.
1310 alloclen += sizeof(struct frag_hdr);
1313 skb = sock_alloc_send_skb(sk,
1315 (flags & MSG_DONTWAIT), &err);
1318 if (atomic_read(&sk->sk_wmem_alloc) <=
1320 skb = sock_wmalloc(sk,
1321 alloclen + hh_len, 1,
1323 if (unlikely(skb == NULL))
1326 /* Only the initial fragment
1335 * Fill in the control structures
1337 skb->ip_summed = CHECKSUM_NONE;
1339 /* reserve for fragmentation and ipsec header */
1340 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1343 if (sk->sk_type == SOCK_DGRAM)
1344 skb_shinfo(skb)->tx_flags = tx_flags;
1347 * Find where to start putting bytes
1349 data = skb_put(skb, fraglen);
1350 skb_set_network_header(skb, exthdrlen);
1351 data += fragheaderlen;
1352 skb->transport_header = (skb->network_header +
1355 skb->csum = skb_copy_and_csum_bits(
1356 skb_prev, maxfraglen,
1357 data + transhdrlen, fraggap, 0);
1358 skb_prev->csum = csum_sub(skb_prev->csum,
1361 pskb_trim_unique(skb_prev, maxfraglen);
1363 copy = datalen - transhdrlen - fraggap;
1369 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1376 length -= datalen - fraggap;
1382 * Put the packet on the pending queue
1384 __skb_queue_tail(&sk->sk_write_queue, skb);
1391 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1395 if (getfrag(from, skb_put(skb, copy),
1396 offset, copy, off, skb) < 0) {
1397 __skb_trim(skb, off);
1402 int i = skb_shinfo(skb)->nr_frags;
1403 struct page_frag *pfrag = sk_page_frag(sk);
1406 if (!sk_page_frag_refill(sk, pfrag))
1409 if (!skb_can_coalesce(skb, i, pfrag->page,
1412 if (i == MAX_SKB_FRAGS)
1415 __skb_fill_page_desc(skb, i, pfrag->page,
1417 skb_shinfo(skb)->nr_frags = ++i;
1418 get_page(pfrag->page);
1420 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1422 page_address(pfrag->page) + pfrag->offset,
1423 offset, copy, skb->len, skb) < 0)
1426 pfrag->offset += copy;
1427 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1429 skb->data_len += copy;
1430 skb->truesize += copy;
1431 atomic_add(copy, &sk->sk_wmem_alloc);
1442 cork->length -= length;
1443 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1446 EXPORT_SYMBOL_GPL(ip6_append_data);
// ip6_cork_release - drop everything the cork holds for this socket.
// Frees the four duplicated option headers and the option block itself,
// then clears np->cork.opt. If a dst is corked, its reference is released,
// the pointer cleared and IPCORK_ALLFRAG reset. The corked flow is zeroed.
// NOTE(review): the guard around the kfree() group is not visible in this
// listing.
1448 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1451 kfree(np->cork.opt->dst0opt);
1452 kfree(np->cork.opt->dst1opt);
1453 kfree(np->cork.opt->hopopt);
1454 kfree(np->cork.opt->srcrt);
1455 kfree(np->cork.opt);
1456 np->cork.opt = NULL;
1459 if (inet->cork.base.dst) {
1460 dst_release(inet->cork.base.dst);
1461 inet->cork.base.dst = NULL;
1462 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1464 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
// ip6_push_pending_frames - turn the skbs queued by ip6_append_data() into
// one packet and send it.
// The first queued skb becomes the head; every further skb is stripped of
// its network header area, chained onto the frag_list of the head, and
// its length and truesize folded into the head. Extension headers from
// the corked options and the IPv6 header are then pushed, using the
// corked flow, hop limit and traffic class. The packet is counted, sent
// with ip6_local_out(), and the cork is released.
// A failure counts OUTDISCARDS against the route.
// NOTE(review): labels, the error branch structure and the return
// statements are not visible in this listing.
1467 int ip6_push_pending_frames(struct sock *sk)
1469 struct sk_buff *skb, *tmp_skb;
1470 struct sk_buff **tail_skb;
1471 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1472 struct inet_sock *inet = inet_sk(sk);
1473 struct ipv6_pinfo *np = inet6_sk(sk);
1474 struct net *net = sock_net(sk);
1475 struct ipv6hdr *hdr;
1476 struct ipv6_txoptions *opt = np->cork.opt;
1477 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1478 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1479 unsigned char proto = fl6->flowi6_proto;
1482 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1484 tail_skb = &(skb_shinfo(skb)->frag_list);
1486 /* move skb->data to ip header from ext header */
1487 if (skb->data < skb_network_header(skb))
1488 __skb_pull(skb, skb_network_offset(skb));
// Chained skbs lose their destructor here, after their truesize has been
// added to the head skb.
1489 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1490 __skb_pull(tmp_skb, skb_network_header_len(skb));
1491 *tail_skb = tmp_skb;
1492 tail_skb = &(tmp_skb->next);
1493 skb->len += tmp_skb->len;
1494 skb->data_len += tmp_skb->len;
1495 skb->truesize += tmp_skb->truesize;
1496 tmp_skb->destructor = NULL;
1500 /* Allow local fragmentation. */
1501 if (np->pmtudisc < IPV6_PMTUDISC_DO)
// final_dst starts as a copy of the flow destination;
// ipv6_push_nfrag_opts() gets its address and may redirect it.
1504 *final_dst = fl6->daddr;
1505 __skb_pull(skb, skb_network_header_len(skb));
1506 if (opt && opt->opt_flen)
1507 ipv6_push_frag_opts(skb, opt, &proto);
1508 if (opt && opt->opt_nflen)
1509 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1511 skb_push(skb, sizeof(struct ipv6hdr));
1512 skb_reset_network_header(skb);
1513 hdr = ipv6_hdr(skb);
1515 ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
1516 hdr->hop_limit = np->cork.hop_limit;
1517 hdr->nexthdr = proto;
1518 hdr->saddr = fl6->saddr;
1519 hdr->daddr = *final_dst;
1521 skb->priority = sk->sk_priority;
1522 skb->mark = sk->sk_mark;
// The skb takes its own reference on the corked route.
1524 skb_dst_set(skb, dst_clone(&rt->dst));
1525 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1526 if (proto == IPPROTO_ICMPV6) {
1527 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1529 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1530 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1533 err = ip6_local_out(skb);
1536 err = net_xmit_errno(err);
1542 ip6_cork_release(inet, np);
1545 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1548 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
// ip6_flush_pending_frames - abandon everything queued by
// ip6_append_data().
// Skbs are removed from the tail of sk->sk_write_queue one at a time and
// OUTDISCARDS is counted against the dst device of each (the guard on that
// count and the freeing of the skb are not visible in this listing).
// The cork state is released afterwards.
1550 void ip6_flush_pending_frames(struct sock *sk)
1552 struct sk_buff *skb;
1554 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1556 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1557 IPSTATS_MIB_OUTDISCARDS);
1561 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1563 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);