2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * The Internet Protocol (IP) output module.
8 * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Donald Becker, <becker@super.org>
13 * Alan Cox, <Alan.Cox@linux.org>
15 * Stefan Becker, <stefanb@yello.ping.de>
16 * Jorge Cwik, <jorge@laser.satlink.net>
17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 * Hirokazu Takahashi, <taka@valinux.co.jp>
20 * See ip_input.c for original log
23 * Alan Cox : Missing nonblock feature in ip_build_xmit.
24 * Mike Kilburn : htons() missing in ip_build_xmit.
25 * Bradford Johnson: Fix faulty handling of some frames when
27 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
28 * (in case if packet not accepted by
29 * output firewall rules)
30 * Mike McLagan : Routing by source
31 * Alexey Kuznetsov: use new route cache
32 * Andi Kleen: Fix broken PMTU recovery and remove
33 * some redundant tests.
34 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
35 * Andi Kleen : Replace ip_reply with ip_send_reply.
36 * Andi Kleen : Split fast and slow ip_build_xmit path
37 * for decreased register pressure on x86
38 * and more readability.
39 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
40 * silently drop skb instead of failing with -EPERM.
41 * Detlev Wengorz : Copy protocol for fragments.
42 * Hirokazu Takahashi: HW checksumming for outgoing UDP
44 * Hirokazu Takahashi: sendfile() on UDP works now.
47 #include <asm/uaccess.h>
48 #include <asm/system.h>
49 #include <linux/module.h>
50 #include <linux/types.h>
51 #include <linux/kernel.h>
52 #include <linux/sched.h>
54 #include <linux/string.h>
55 #include <linux/errno.h>
56 #include <linux/config.h>
58 #include <linux/socket.h>
59 #include <linux/sockios.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/etherdevice.h>
64 #include <linux/proc_fs.h>
65 #include <linux/stat.h>
66 #include <linux/init.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
74 #include <linux/skbuff.h>
79 #include <net/checksum.h>
80 #include <net/inetpeer.h>
81 #include <net/checksum.h>
82 #include <linux/igmp.h>
83 #include <linux/netfilter_ipv4.h>
84 #include <linux/netfilter_bridge.h>
85 #include <linux/mroute.h>
86 #include <linux/netlink.h>
/*
 * Tunables exported to sysctl.  NOTE(review): this excerpt is
 * line-sampled; gaps in the leading numbers mark elided source lines.
 */
89 * Shall we try to damage output packets if routing dev changes?
/* 0/1 switch: rewrite the source address if the output device changes. */
92 int sysctl_ip_dynaddr;
/* Default TTL for outgoing IP headers (IPDEFTTL — presumably 64; confirm in linux/ip.h). */
93 int sysctl_ip_default_ttl = IPDEFTTL;
95 /* Generate a checksum for an outgoing IP datagram. */
96 __inline__ void ip_send_check(struct iphdr *iph)
99 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
102 /* dev_loopback_xmit for use with netfilter. */
/*
 * Re-inject a cloned skb into the local receive path so that
 * multicast/broadcast senders also see their own frames.
 * NOTE(review): excerpt is line-sampled; the tail of this function
 * (re-injection call and return) is elided.
 */
103 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
105 newskb->mac.raw = newskb->data;
/* Strip everything before the IP header; nh.raw marks its start. */
106 __skb_pull(newskb, newskb->nh.raw - newskb->data);
107 newskb->pkt_type = PACKET_LOOPBACK;
/* Locally generated data needs no checksum re-verification. */
108 newskb->ip_summed = CHECKSUM_UNNECESSARY;
109 BUG_TRAP(newskb->dst);
/*
 * Choose the TTL for an outgoing unicast packet: the per-socket
 * uc_ttl when set, otherwise the route's hop-limit metric.
 */
115 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
117 int ttl = inet->uc_ttl;
/* NOTE(review): the guard selecting this fallback is elided from the excerpt. */
120 ttl = dst_metric(dst, RTAX_HOPLIMIT);
/*
 * ip_build_and_send_pkt: prepend an IPv4 header to an already-routed
 * skb (skb->dst set) and hand it to netfilter LOCAL_OUT.
 * NOTE(review): excerpt is line-sampled; gaps in the leading numbers
 * mark elided source lines.
 */
125 * Add an ip header to a skbuff and send it out.
128 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
129 u32 saddr, u32 daddr, struct ip_options *opt)
131 struct inet_sock *inet = inet_sk(sk);
132 struct rtable *rt = (struct rtable *)skb->dst;
135 /* Build the IP header. */
/* With options present, reserve room for them after the base header. */
137 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
139 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
143 iph->tos = inet->tos;
/* Set DF when path-MTU discovery forbids fragmentation on this route. */
144 if (ip_dont_fragment(sk, &rt->u.dst))
145 iph->frag_off = htons(IP_DF);
148 iph->ttl = ip_select_ttl(inet, &rt->u.dst);
/* Addresses are taken from the route, not the saddr/daddr arguments. */
149 iph->daddr = rt->rt_dst;
150 iph->saddr = rt->rt_src;
151 iph->protocol = sk->sk_protocol;
152 iph->tot_len = htons(skb->len);
153 ip_select_ident(iph, &rt->u.dst, sk);
156 if (opt && opt->optlen) {
157 iph->ihl += opt->optlen>>2;
/* The daddr argument is used here so options record the real destination. */
158 ip_options_build(skb, opt, daddr, rt, 0);
162 skb->priority = sk->sk_priority;
/* Send via the netfilter LOCAL_OUT hook (okfn elided in this excerpt). */
165 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
/*
 * ip_finish_output2: last IP-level step before the link layer.
 * Guarantees headroom for the hardware header, then transmits via the
 * cached hardware header (hh) or the neighbour's output method.
 * NOTE(review): excerpt is line-sampled; gaps mark elided lines
 * (error handling, skb replacement, etc.).
 */
169 static inline int ip_finish_output2(struct sk_buff *skb)
171 struct dst_entry *dst = skb->dst;
172 struct hh_cache *hh = dst->hh;
173 struct net_device *dev = dst->dev;
174 int hh_len = LL_RESERVED_SPACE(dev);
176 /* Be paranoid, rather than too clever. */
/* Not enough headroom for the device's link header: reallocate. */
177 if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
178 struct sk_buff *skb2;
180 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
/* Keep write-memory accounting tied to the owning socket, if any. */
186 skb_set_owner_w(skb2, skb->sk);
191 #ifdef CONFIG_BRIDGE_NETFILTER
192 /* bridge-netfilter defers calling some IP hooks to the bridge layer
193 * and still needs the conntrack reference.
195 if (skb->nf_bridge == NULL)
/* Fast path: copy the cached, pre-built hardware header under its lock. */
202 read_lock_bh(&hh->hh_lock);
203 hh_alen = HH_DATA_ALIGN(hh->hh_len);
204 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
205 read_unlock_bh(&hh->hh_lock);
206 skb_push(skb, hh->hh_len);
207 return hh->hh_output(skb);
208 } else if (dst->neighbour)
209 return dst->neighbour->output(skb);
/* No way to build a link header: log; drop/error path is elided. */
212 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
/*
 * ip_finish_output: tag the frame as IPv4 and run the POST_ROUTING
 * netfilter hook (the continuation callback is elided in this excerpt).
 */
217 int ip_finish_output(struct sk_buff *skb)
219 struct net_device *dev = skb->dst->dev;
222 skb->protocol = htons(ETH_P_IP);
224 return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
/*
 * ip_mc_output: transmit path for multicast/broadcast routes.  Loops
 * a clone back to local listeners where required, honours TTL 0, and
 * fragments when the frame exceeds the path MTU.  NOTE(review):
 * excerpt is line-sampled; gaps mark elided lines.
 */
228 int ip_mc_output(struct sk_buff *skb)
230 struct sock *sk = skb->sk;
231 struct rtable *rt = (struct rtable*)skb->dst;
232 struct net_device *dev = rt->u.dst.dev;
235 * If the indicated interface is up and running, send the packet.
237 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
240 skb->protocol = htons(ETH_P_IP);
243 * Multicasts are looped back for other local users
/* Loop back unless the sender disabled IP_MULTICAST_LOOP (mc_loop). */
246 if (rt->rt_flags&RTCF_MULTICAST) {
247 if ((!sk || inet_sk(sk)->mc_loop)
248 #ifdef CONFIG_IP_MROUTE
249 /* Small optimization: do not loopback not local frames,
250 which returned after forwarding; they will be dropped
251 by ip_mr_input in any case.
252 Note, that local frames are looped back to be delivered
255 This check is duplicated in ip_mr_input at the moment.
257 && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
/* Deliver a clone locally through the loopback-xmit helper. */
260 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
262 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
264 ip_dev_loopback_xmit);
267 /* Multicasts with ttl 0 must not go beyond the host */
269 if (skb->nh.iph->ttl == 0) {
/* Broadcasts are also delivered back to the sending host via a clone. */
275 if (rt->rt_flags&RTCF_BROADCAST) {
276 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
278 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
279 newskb->dev, ip_dev_loopback_xmit);
282 if (skb->len > dst_mtu(&rt->u.dst))
283 return ip_fragment(skb, ip_finish_output);
285 return ip_finish_output(skb);
/*
 * ip_output: standard unicast transmit entry.  Fragment unless the
 * frame fits the MTU or carries TSO state (hardware segments it).
 */
288 int ip_output(struct sk_buff *skb)
290 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
292 if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
293 return ip_fragment(skb, ip_finish_output);
295 return ip_finish_output(skb);
/*
 * ip_queue_xmit: main transmit entry for connected sockets (TCP).
 * Routes the packet if necessary (reusing the socket's cached route),
 * builds the IP header and passes the result to netfilter LOCAL_OUT.
 * @ipfragok: when non-zero, suppress the DF bit even under PMTU-DO.
 * NOTE(review): excerpt is line-sampled; gaps mark elided lines.
 */
298 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
300 struct sock *sk = skb->sk;
301 struct inet_sock *inet = inet_sk(sk);
302 struct ip_options *opt = inet->opt;
306 /* Skip all of this if the packet is already routed,
307 * f.e. by something like SCTP.
309 rt = (struct rtable *) skb->dst;
313 /* Make sure we can route this packet. */
314 rt = (struct rtable *)__sk_dst_check(sk, 0);
318 /* Use correct destination address if we have options. */
/* Route lookup keyed by the socket's addressing and bound device. */
324 struct flowi fl = { .oif = sk->sk_bound_dev_if,
327 .saddr = inet->saddr,
328 .tos = RT_CONN_FLAGS(sk) } },
329 .proto = sk->sk_protocol,
331 { .sport = inet->sport,
332 .dport = inet->dport } } };
334 /* If this fails, retransmit mechanism of transport layer will
335 * keep trying until route appears or the connection times
338 if (ip_route_output_flow(&rt, &fl, sk, 0))
/* Cache the freshly looked-up route on the socket. */
341 __sk_dst_set(sk, &rt->u.dst);
342 tcp_v4_setup_caps(sk, &rt->u.dst);
344 skb->dst = dst_clone(&rt->u.dst);
/* Strict source routing requires the next hop to be the recorded one. */
347 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
350 /* OK, we know where to send it, allocate and build IP header. */
351 iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
/* Write version=4, ihl=5 and TOS in a single 16-bit store. */
352 *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
353 iph->tot_len = htons(skb->len);
354 if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
355 iph->frag_off = htons(IP_DF);
358 iph->ttl = ip_select_ttl(inet, &rt->u.dst);
359 iph->protocol = sk->sk_protocol;
360 iph->saddr = rt->rt_src;
361 iph->daddr = rt->rt_dst;
363 /* Transport layer set skb->h.foo itself. */
365 if (opt && opt->optlen) {
366 iph->ihl += opt->optlen >> 2;
367 ip_options_build(skb, opt, inet->daddr, rt, 0);
/* IP-ID selection accounts for TSO emitting multiple segments. */
370 ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
372 /* Add an IP checksum. */
375 skb->priority = sk->sk_priority;
377 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
/* no_route error path (label itself elided): count and fail. */
381 IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
383 return -EHOSTUNREACH;
/*
 * ip_copy_metadata: propagate per-packet bookkeeping from the original
 * skb to a freshly made fragment — dst reference, priority, QoS index
 * and netfilter/conntrack state.  NOTE(review): excerpt is
 * line-sampled; gaps mark elided lines.
 */
387 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
389 to->pkt_type = from->pkt_type;
390 to->priority = from->priority;
391 to->protocol = from->protocol;
392 to->security = from->security;
/* Drop any stale dst before taking a reference on the source's. */
393 dst_release(to->dst);
394 to->dst = dst_clone(from->dst);
397 /* Copy the flags to each fragment. */
398 IPCB(to)->flags = IPCB(from)->flags;
400 #ifdef CONFIG_NET_SCHED
401 to->tc_index = from->tc_index;
403 #ifdef CONFIG_NETFILTER
404 to->nfmark = from->nfmark;
405 to->nfcache = from->nfcache;
406 /* Connection association is same as pre-frag packet */
407 nf_conntrack_put(to->nfct);
408 to->nfct = from->nfct;
409 nf_conntrack_get(to->nfct);
410 to->nfctinfo = from->nfctinfo;
411 #ifdef CONFIG_BRIDGE_NETFILTER
412 nf_bridge_put(to->nf_bridge);
413 to->nf_bridge = from->nf_bridge;
414 nf_bridge_get(to->nf_bridge);
/*
 * ip_fragment: split an oversized datagram into MTU-sized fragments
 * and hand each to @output.  Two strategies: a fast path that reuses
 * an existing frag_list when its geometry already fits, and a slow
 * path that allocates and copies fresh fragment skbs.  NOTE(review):
 * excerpt is line-sampled; gaps in the leading numbers mark elided
 * source lines (loop headers, error paths, frees).
 */
420 * This IP datagram is too large to be sent in one piece. Break it up into
421 * smaller pieces (each of size equal to IP header plus
422 * a block of the data of the original IP data part) that will yet fit in a
423 * single device frame, and queue such a frame for sending.
426 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
431 struct net_device *dev;
432 struct sk_buff *skb2;
433 unsigned int mtu, hlen, left, len, ll_rs;
436 struct rtable *rt = (struct rtable*)skb->dst;
442 * Point into the IP datagram header.
/* DF set and not locally overridden: reply with ICMP FRAG_NEEDED. */
447 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
448 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
449 htonl(dst_mtu(&rt->u.dst)));
455 * Setup starting values.
459 mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
461 /* When frag_list is given, use it. First, check its validity:
462 * some transformers could create wrong frag_list or break existing
463 * one, it is not prohibited. In this case fall back to copying.
465 * LATER: this step can be merged to real generation of fragments,
466 * we can switch to copy when see the first bad fragment.
468 if (skb_shinfo(skb)->frag_list) {
469 struct sk_buff *frag;
470 int first_len = skb_pagelen(skb);
/* Head must fit the MTU, be 8-byte aligned, and not itself be a fragment. */
472 if (first_len - hlen > mtu ||
473 ((first_len - hlen) & 7) ||
474 (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
478 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
479 /* Correct geometry. */
480 if (frag->len > mtu ||
481 ((frag->len & 7) && frag->next) ||
482 skb_headroom(frag) < hlen)
485 /* Partially cloned skb? */
486 if (skb_shared(frag))
/* Transfer write-memory accounting from the head skb to each fragment. */
493 frag->destructor = sock_wfree;
494 skb->truesize -= frag->truesize;
498 /* Everything is OK. Generate! */
502 frag = skb_shinfo(skb)->frag_list;
503 skb_shinfo(skb)->frag_list = NULL;
504 skb->data_len = first_len - skb_headlen(skb);
505 skb->len = first_len;
506 iph->tot_len = htons(first_len);
507 iph->frag_off = htons(IP_MF);
511 /* Prepare header of the next frame,
512 * before previous one went down. */
514 frag->ip_summed = CHECKSUM_NONE;
515 frag->h.raw = frag->data;
/* Prepend a copy of the current IP header to the next fragment. */
516 frag->nh.raw = __skb_push(frag, hlen);
517 memcpy(frag->nh.raw, iph, hlen);
519 iph->tot_len = htons(frag->len);
520 ip_copy_metadata(frag, skb);
/* After the first fragment, strip options not meant to be copied. */
522 ip_options_fragment(frag);
523 offset += skb->len - hlen;
/* Fragment offset is expressed in 8-byte units. */
524 iph->frag_off = htons(offset>>3);
525 if (frag->next != NULL)
526 iph->frag_off |= htons(IP_MF);
527 /* Ready, complete checksum */
542 IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
551 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
/* Slow path from here: allocate and copy each fragment. */
556 left = skb->len - hlen; /* Space per frame */
557 ptr = raw + hlen; /* Where to start from */
559 #ifdef CONFIG_BRIDGE_NETFILTER
560 /* for bridged IP traffic encapsulated inside f.e. a vlan header,
561 * we need to make room for the encapsulating header */
562 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
563 mtu -= nf_bridge_pad(skb);
565 ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
568 * Fragment the datagram.
571 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
572 not_last_frag = iph->frag_off & htons(IP_MF);
575 * Keep copying data until we run out.
580 /* IF: it doesn't fit, use 'mtu' - the data space left */
583 /* IF: we are not sending upto and including the packet end
584 then align the next start on an eight byte boundary */
592 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
593 NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
599 * Set up data on packet
602 ip_copy_metadata(skb2, skb);
603 skb_reserve(skb2, ll_rs);
604 skb_put(skb2, len + hlen);
605 skb2->nh.raw = skb2->data;
606 skb2->h.raw = skb2->data + hlen;
609 * Charge the memory for the fragment to any owner
614 skb_set_owner_w(skb2, skb->sk);
617 * Copy the packet header into the new buffer.
620 memcpy(skb2->nh.raw, skb->data, hlen);
623 * Copy a block of the IP datagram.
625 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
630 * Fill in the new header fields.
633 iph->frag_off = htons((offset >> 3));
635 /* ANK: dirty, but effective trick. Upgrade options only if
636 * the segment to be fragmented was THE FIRST (otherwise,
637 * options are already fixed) and make it ONCE
638 * on the initial skb, so that all the following fragments
639 * will inherit fixed options.
642 ip_options_fragment(skb);
645 * Added AC : If we are fragmenting a fragment that's not the
646 * last fragment then keep MF on each bit
648 if (left > 0 || not_last_frag)
649 iph->frag_off |= htons(IP_MF);
654 * Put this fragment into the sending queue.
657 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
659 iph->tot_len = htons(len + hlen);
668 IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
673 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
/*
 * ip_generic_getfrag: standard getfrag callback for ip_append_data().
 * Copies user iovec data into the skb; computes a partial checksum on
 * the fly unless the hardware will checksum (CHECKSUM_HW).
 */
678 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
680 struct iovec *iov = from;
682 if (skb->ip_summed == CHECKSUM_HW) {
683 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
686 unsigned int csum = 0;
687 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
/* Fold this chunk into the running skb checksum at byte-parity 'odd'. */
689 skb->csum = csum_block_add(skb->csum, csum, odd);
/*
 * csum_page: checksum @copy bytes of @page starting at @offset.
 * NOTE(review): the kmap/kunmap bracketing around the csum_partial
 * call is elided from this excerpt.
 */
694 static inline unsigned int
695 csum_page(struct page *page, int offset, int copy)
700 csum = csum_partial(kaddr + offset, copy, 0);
/*
 * ip_append_data: queue caller data on the socket as a chain of
 * fragment-sized skbs, to be combined and transmitted later by
 * ip_push_pending_frames().  NOTE(review): excerpt is line-sampled;
 * gaps in the leading numbers mark elided source lines.
 */
706 * ip_append_data() and ip_append_page() can make one large IP datagram
707 * from many pieces of data. Each piece will be held on the socket
708 * until ip_push_pending_frames() is called. Each piece can be a page
711 * Not only UDP, other transport protocols - e.g. raw sockets - can use
712 * this interface potentially.
714 * LATER: length must be adjusted by pad at tail, when it is required.
716 int ip_append_data(struct sock *sk,
717 int getfrag(void *from, char *to, int offset, int len,
718 int odd, struct sk_buff *skb),
719 void *from, int length, int transhdrlen,
720 struct ipcm_cookie *ipc, struct rtable *rt,
723 struct inet_sock *inet = inet_sk(sk);
726 struct ip_options *opt = NULL;
733 unsigned int maxfraglen, fragheaderlen;
734 int csummode = CHECKSUM_NONE;
/* First call for this datagram: set up the cork state. */
739 if (skb_queue_empty(&sk->sk_write_queue)) {
745 if (inet->cork.opt == NULL) {
/* 40 == maximum possible size of IPv4 options. */
746 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
747 if (unlikely(inet->cork.opt == NULL))
750 memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
751 inet->cork.flags |= IPCORK_OPT;
752 inet->cork.addr = ipc->addr;
754 dst_hold(&rt->u.dst);
755 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
757 inet->cork.length = 0;
758 sk->sk_sndmsg_page = NULL;
759 sk->sk_sndmsg_off = 0;
/* Reserve room for transform (e.g. IPsec) headers on the first skb. */
760 if ((exthdrlen = rt->u.dst.header_len) != 0) {
762 transhdrlen += exthdrlen;
766 if (inet->cork.flags & IPCORK_OPT)
767 opt = inet->cork.opt;
771 mtu = inet->cork.fragsize;
773 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
775 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
/* Largest 8-byte-aligned fragment payload, plus its IP header. */
776 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
/* Total IP datagram length is a 16-bit field: enforce the limit. */
778 if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
779 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
784 * transhdrlen > 0 means that this is the first fragment and we wish
785 * it won't be fragmented in the future.
/* Hardware checksumming only applies to a single-fragment send. */
788 length + fragheaderlen <= mtu &&
789 rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
791 csummode = CHECKSUM_HW;
793 inet->cork.length += length;
795 /* So, what's going on in the loop below?
797 * We use calculated fragment length to generate chained skb,
798 * each of segments is IP fragment ready for sending to network after
799 * adding appropriate IP header.
802 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
806 /* Check if the remaining data fits into current packet. */
807 copy = mtu - skb->len;
809 copy = maxfraglen - skb->len;
/* A new fragment skb is needed. */
812 unsigned int datalen;
813 unsigned int fraglen;
814 unsigned int fraggap;
815 unsigned int alloclen;
816 struct sk_buff *skb_prev;
/* Bytes past the aligned boundary that must migrate into this skb. */
820 fraggap = skb_prev->len - maxfraglen;
825 * If remaining data exceeds the mtu,
826 * we know we need more fragment(s).
828 datalen = length + fraggap;
829 if (datalen > mtu - fragheaderlen)
830 datalen = maxfraglen - fragheaderlen;
831 fraglen = datalen + fragheaderlen;
833 if ((flags & MSG_MORE) &&
834 !(rt->u.dst.dev->features&NETIF_F_SG))
837 alloclen = datalen + fragheaderlen;
839 /* The last fragment gets additional space at tail.
840 * Note, with MSG_MORE we overallocate on fragments,
841 * because we have no idea what fragment will be
844 if (datalen == length)
845 alloclen += rt->u.dst.trailer_len;
848 skb = sock_alloc_send_skb(sk,
849 alloclen + hh_len + 15,
850 (flags & MSG_DONTWAIT), &err);
/* Fallback allocation while some send budget remains. */
853 if (atomic_read(&sk->sk_wmem_alloc) <=
855 skb = sock_wmalloc(sk,
856 alloclen + hh_len + 15, 1,
858 if (unlikely(skb == NULL))
865 * Fill in the control structures
867 skb->ip_summed = csummode;
869 skb_reserve(skb, hh_len);
872 * Find where to start putting bytes.
874 data = skb_put(skb, fraglen);
875 skb->nh.raw = data + exthdrlen;
876 data += fragheaderlen;
877 skb->h.raw = data + exthdrlen;
/* Move the unaligned tail of the previous fragment here, fixing checksums. */
880 skb->csum = skb_copy_and_csum_bits(
881 skb_prev, maxfraglen,
882 data + transhdrlen, fraggap, 0);
883 skb_prev->csum = csum_sub(skb_prev->csum,
886 skb_trim(skb_prev, maxfraglen);
889 copy = datalen - transhdrlen - fraggap;
890 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
897 length -= datalen - fraggap;
/* Only the first fragment carries the transport header / HW csum mode. */
900 csummode = CHECKSUM_NONE;
903 * Put the packet on the pending queue.
905 __skb_queue_tail(&sk->sk_write_queue, skb);
/* Device cannot scatter/gather: copy into the linear area. */
912 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
916 if (getfrag(from, skb_put(skb, copy),
917 offset, copy, off, skb) < 0) {
918 __skb_trim(skb, off);
/* Scatter/gather path: append to (or extend) page fragments. */
923 int i = skb_shinfo(skb)->nr_frags;
924 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
925 struct page *page = sk->sk_sndmsg_page;
926 int off = sk->sk_sndmsg_off;
929 if (page && (left = PAGE_SIZE - off) > 0) {
932 if (page != frag->page) {
933 if (i == MAX_SKB_FRAGS) {
938 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
939 frag = &skb_shinfo(skb)->frags[i];
941 } else if (i < MAX_SKB_FRAGS) {
942 if (copy > PAGE_SIZE)
944 page = alloc_pages(sk->sk_allocation, 0);
949 sk->sk_sndmsg_page = page;
950 sk->sk_sndmsg_off = 0;
952 skb_fill_page_desc(skb, i, page, 0, 0);
953 frag = &skb_shinfo(skb)->frags[i];
954 skb->truesize += PAGE_SIZE;
955 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
960 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
964 sk->sk_sndmsg_off += copy;
967 skb->data_len += copy;
/* Error path (label elided): undo the length accounting and count. */
976 inet->cork.length -= length;
977 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
/*
 * ip_append_page: zero-copy companion of ip_append_data(); appends
 * page data (sendfile-style) to the pending queue.  Requires a
 * scatter/gather-capable device (NETIF_F_SG).  NOTE(review): excerpt
 * is line-sampled; gaps mark elided lines.
 */
981 ssize_t ip_append_page(struct sock *sk, struct page *page,
982 int offset, size_t size, int flags)
984 struct inet_sock *inet = inet_sk(sk);
987 struct ip_options *opt = NULL;
992 unsigned int maxfraglen, fragheaderlen, fraggap;
/* Must follow an ip_append_data() call that set up the cork state. */
1000 if (skb_queue_empty(&sk->sk_write_queue))
1004 if (inet->cork.flags & IPCORK_OPT)
1005 opt = inet->cork.opt;
1007 if (!(rt->u.dst.dev->features&NETIF_F_SG))
1010 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1011 mtu = inet->cork.fragsize;
1013 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1014 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
/* The 16-bit total-length field limits the datagram size. */
1016 if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1017 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1021 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1024 inet->cork.length += size;
1029 /* Check if the remaining data fits into current packet. */
1030 len = mtu - skb->len;
1032 len = maxfraglen - skb->len;
/* Start a new fragment skb. */
1034 struct sk_buff *skb_prev;
1041 fraggap = skb_prev->len - maxfraglen;
1045 alloclen = fragheaderlen + hh_len + fraggap + 15;
1046 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1047 if (unlikely(!skb)) {
1053 * Fill in the control structures
1055 skb->ip_summed = CHECKSUM_NONE;
1057 skb_reserve(skb, hh_len);
1060 * Find where to start putting bytes.
1062 data = skb_put(skb, fragheaderlen + fraggap);
1063 skb->nh.iph = iph = (struct iphdr *)data;
1064 data += fragheaderlen;
/* Migrate the unaligned tail of the previous fragment, fixing checksums. */
1068 skb->csum = skb_copy_and_csum_bits(
1069 skb_prev, maxfraglen,
1071 skb_prev->csum = csum_sub(skb_prev->csum,
1073 skb_trim(skb_prev, maxfraglen);
1077 * Put the packet on the pending queue.
1079 __skb_queue_tail(&sk->sk_write_queue, skb);
1083 i = skb_shinfo(skb)->nr_frags;
/* Extend the last page frag when contiguous, else add a new descriptor. */
1086 if (skb_can_coalesce(skb, i, page, offset)) {
1087 skb_shinfo(skb)->frags[i-1].size += len;
1088 } else if (i < MAX_SKB_FRAGS) {
1090 skb_fill_page_desc(skb, i, page, offset, len);
/* Software checksum over the page data when not offloaded. */
1096 if (skb->ip_summed == CHECKSUM_NONE) {
1098 csum = csum_page(page, offset, len);
1099 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1103 skb->data_len += len;
/* Error path (label elided): undo accounting and count the discard. */
1110 inet->cork.length -= size;
1111 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
/*
 * ip_push_pending_frames: collapse the socket's pending fragment skbs
 * into one datagram (head skb + frag_list), build the IP header and
 * send the whole thing through netfilter LOCAL_OUT.  NOTE(review):
 * excerpt is line-sampled; gaps mark elided lines.
 */
1116 * Combined all pending IP fragments on the socket as one IP datagram
1117 * and push them out.
1119 int ip_push_pending_frames(struct sock *sk)
1121 struct sk_buff *skb, *tmp_skb;
1122 struct sk_buff **tail_skb;
1123 struct inet_sock *inet = inet_sk(sk);
1124 struct ip_options *opt = NULL;
1125 struct rtable *rt = inet->cork.rt;
1131 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1133 tail_skb = &(skb_shinfo(skb)->frag_list);
1135 /* move skb->data to ip header from ext header */
1136 if (skb->data < skb->nh.raw)
1137 __skb_pull(skb, skb->nh.raw - skb->data);
/* Chain the rest of the queue onto the head skb's frag_list. */
1138 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1139 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1140 *tail_skb = tmp_skb;
1141 tail_skb = &(tmp_skb->next);
1142 skb->len += tmp_skb->len;
1143 skb->data_len += tmp_skb->len;
1144 skb->truesize += tmp_skb->truesize;
/* Ownership moves to the head skb: drop each child's socket reference. */
1145 __sock_put(tmp_skb->sk);
1146 tmp_skb->destructor = NULL;
1150 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1151 * to fragment the frame generated here. No matter, what transforms
1152 * how transforms change size of the packet, it will come out.
1154 if (inet->pmtudisc != IP_PMTUDISC_DO)
1157 /* DF bit is set when we want to see DF on outgoing frames.
1158 * If local_df is set too, we still allow to fragment this frame
1160 if (inet->pmtudisc == IP_PMTUDISC_DO ||
1161 (skb->len <= dst_mtu(&rt->u.dst) &&
1162 ip_dont_fragment(sk, &rt->u.dst)))
1165 if (inet->cork.flags & IPCORK_OPT)
1166 opt = inet->cork.opt;
1168 if (rt->rt_type == RTN_MULTICAST)
1171 ttl = ip_select_ttl(inet, &rt->u.dst);
1173 iph = (struct iphdr *)skb->data;
1177 iph->ihl += opt->optlen>>2;
1178 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1180 iph->tos = inet->tos;
1181 iph->tot_len = htons(skb->len);
/* Datagram may still be fragmented: use full IP-id selection here... */
1184 __ip_select_ident(iph, &rt->u.dst, 0);
/* ...or (elided alternative branch) the cheap per-socket id counter. */
1186 iph->id = htons(inet->id++);
1189 iph->protocol = sk->sk_protocol;
1190 iph->saddr = rt->rt_src;
1191 iph->daddr = rt->rt_dst;
1194 skb->priority = sk->sk_priority;
1195 skb->dst = dst_clone(&rt->u.dst);
1197 /* Netfilter gets the whole, not yet fragmented skb. */
1198 err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1199 skb->dst->dev, dst_output);
1202 err = inet->recverr ? net_xmit_errno(err) : 0;
/* Common exit: release the corked options and cached route. */
1208 inet->cork.flags &= ~IPCORK_OPT;
1209 if (inet->cork.opt) {
1210 kfree(inet->cork.opt);
1211 inet->cork.opt = NULL;
1213 if (inet->cork.rt) {
1214 ip_rt_put(inet->cork.rt);
1215 inet->cork.rt = NULL;
1220 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
/*
 * ip_flush_pending_frames: discard all data queued on the socket and
 * release the cork state (options buffer and cached route).
 */
1225 * Throw away all pending data on the socket.
1227 void ip_flush_pending_frames(struct sock *sk)
1229 struct inet_sock *inet = inet_sk(sk);
1230 struct sk_buff *skb;
/* Free every queued fragment skb (the free call itself is elided). */
1232 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1235 inet->cork.flags &= ~IPCORK_OPT;
1236 if (inet->cork.opt) {
1237 kfree(inet->cork.opt);
1238 inet->cork.opt = NULL;
1240 if (inet->cork.rt) {
1241 ip_rt_put(inet->cork.rt);
1242 inet->cork.rt = NULL;
/*
 * ip_reply_glue_bits: getfrag callback for ip_send_reply().  The data
 * is already in kernel space, so copy with an on-the-fly checksum and
 * fold it into the skb's running checksum.
 */
1248 * Fetch data from kernel space and fill in checksum if needed.
1250 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1251 int len, int odd, struct sk_buff *skb)
1255 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1256 skb->csum = csum_block_add(skb->csum, csum, odd);
/*
 * ip_send_reply: send a reply to @skb (used for TCP resets/ACKs).
 * Echoes the incoming IP options, routes back toward the source, then
 * reuses the ip_append_data / ip_push_pending_frames machinery.
 * Caller must serialize per socket.  NOTE(review): excerpt is
 * line-sampled; gaps mark elided lines.
 */
1261 * Generic function to send a packet as reply to another packet.
1262 * Used to send TCP resets so far. ICMP should use this function too.
1264 * Should run single threaded per socket because it uses the sock
1265 * structure to pass arguments.
1267 * LATER: switch from ip_build_xmit to ip_append_*
1269 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1272 struct inet_sock *inet = inet_sk(sk);
1274 struct ip_options opt;
1277 struct ipcm_cookie ipc;
1279 struct rtable *rt = (struct rtable*)skb->dst;
/* Mirror the received options (e.g. recorded routes) into the reply. */
1281 if (ip_options_echo(&replyopts.opt, skb))
1284 daddr = ipc.addr = rt->rt_src;
1287 if (replyopts.opt.optlen) {
1288 ipc.opt = &replyopts.opt;
/* With source routing, the reply goes to the first-hop address. */
1291 daddr = replyopts.opt.faddr;
1295 struct flowi fl = { .nl_u = { .ip4_u =
1297 .saddr = rt->rt_spec_dst,
1298 .tos = RT_TOS(skb->nh.iph->tos) } },
1299 /* Not quite clean, but right. */
/* Swap the transport ports from the packet being answered. */
1301 { .sport = skb->h.th->dest,
1302 .dport = skb->h.th->source } },
1303 .proto = sk->sk_protocol };
1304 if (ip_route_output_key(&rt, &fl))
1308 /* And let IP do all the hard work.
1310 This chunk is not reenterable, hence spinlock.
1311 Note that it uses the fact, that this function is called
1312 with locally disabled BH and that sk cannot be already spinlocked.
/* Borrow the control socket's fields for this single send. */
1315 inet->tos = skb->nh.iph->tos;
1316 sk->sk_priority = skb->priority;
1317 sk->sk_protocol = skb->nh.iph->protocol;
1318 ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1319 &ipc, rt, MSG_DONTWAIT);
1320 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
/* Patch the caller-supplied checksum into the transport header. */
1321 if (arg->csumoffset >= 0)
1322 *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1323 skb->ip_summed = CHECKSUM_NONE;
1324 ip_push_pending_frames(sk);
/*
 * Module initialisation and exported symbols.  NOTE(review): excerpt
 * is line-sampled; gaps mark elided lines (remaining struct fields,
 * additional init calls and exports).
 */
1333 * IP protocol layer initialiser
1336 static struct packet_type ip_packet_type = {
1337 .type = __constant_htons(ETH_P_IP),
1342 * IP registers the packet type and then calls the subprotocol initialisers
/* Hook IPv4 into the device layer; further init calls are elided. */
1345 void __init ip_init(void)
1347 dev_add_pack(&ip_packet_type);
1352 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1353 igmp_mc_proc_init();
1357 EXPORT_SYMBOL(ip_finish_output);
1358 EXPORT_SYMBOL(ip_fragment);
1359 EXPORT_SYMBOL(ip_generic_getfrag);
1360 EXPORT_SYMBOL(ip_queue_xmit);
1361 EXPORT_SYMBOL(ip_send_check);
1363 #ifdef CONFIG_SYSCTL
1364 EXPORT_SYMBOL(sysctl_ip_default_ttl);