2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * PACKET - implements raw packet sockets.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
35 * Ulises Alonso : Frame number limit removal and
36 * packet_set_ring memory leak.
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
40 * byte arrays at the end of sockaddr_ll
42 * Johann Baudy : Added TX RING.
43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
55 #include <linux/types.h>
57 #include <linux/capability.h>
58 #include <linux/fcntl.h>
59 #include <linux/socket.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_packet.h>
64 #include <linux/wireless.h>
65 #include <linux/kernel.h>
66 #include <linux/kmod.h>
67 #include <linux/slab.h>
68 #include <linux/vmalloc.h>
69 #include <net/net_namespace.h>
71 #include <net/protocol.h>
72 #include <linux/skbuff.h>
74 #include <linux/errno.h>
75 #include <linux/timer.h>
76 #include <asm/uaccess.h>
77 #include <asm/ioctls.h>
79 #include <asm/cacheflush.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/poll.h>
84 #include <linux/module.h>
85 #include <linux/init.h>
86 #include <linux/mutex.h>
87 #include <linux/if_vlan.h>
88 #include <linux/virtio_net.h>
89 #include <linux/errqueue.h>
90 #include <linux/net_tstamp.h>
91 #include <linux/percpu.h>
93 #include <net/inet_common.h>
100 - if device has no dev->hard_header routine, it adds and removes ll header
101 inside itself. In this case ll header is invisible outside of device,
102 but higher levels still should reserve dev->hard_header_len.
103 Some devices are clever enough to reallocate the skb when the header
104 will not fit into the reserved space (tunnels); others are not so clever.
106 - packet socket receives packets with the ll header already pulled,
107 so SOCK_RAW must push it back.
112 Incoming, dev->hard_header!=NULL
113 mac_header -> ll header
116 Outgoing, dev->hard_header!=NULL
117 mac_header -> ll header
120 Incoming, dev->hard_header==NULL
121 mac_header -> UNKNOWN position. It very likely points to the ll
122 header. PPP does this, which is wrong, because it introduces
123 asymmetry between the rx and tx paths.
126 Outgoing, dev->hard_header==NULL
127 mac_header -> data. ll header is still not built!
131 If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
137 dev->hard_header != NULL
138 mac_header -> ll header
141 dev->hard_header == NULL (ll header is added by device, we cannot control it)
145 We should set nh.raw on output to the correct position;
146 the packet classifier depends on it.
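/* Illustration (a hedged userspace sketch, not part of this file): the
 * practical upshot of the above is that SOCK_RAW sees the ll header
 * while SOCK_DGRAM does not:
 *
 *	int raw = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	// reads return frames starting at the ll (e.g. Ethernet) header
 *
 *	int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	// reads return frames starting at the network header; on send
 *	// the device builds the ll header from the sockaddr_ll given
 */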
149 /* Private packet socket structures. */
151 /* identical to struct packet_mreq except it has
152 * a longer address field.
154 struct packet_mreq_max {
156 unsigned short mr_type;
157 unsigned short mr_alen;
158 unsigned char mr_address[MAX_ADDR_LEN];
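/* A hedged userspace sketch of how these membership structures are used:
 * the kernel accepts either struct packet_mreq or this longer _max variant
 * for PACKET_ADD_MEMBERSHIP / PACKET_DROP_MEMBERSHIP, e.g. to enable
 * promiscuous mode on one interface:
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = ifindex,		// e.g. from if_nametoindex()
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */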
162 struct tpacket_hdr *h1;
163 struct tpacket2_hdr *h2;
164 struct tpacket3_hdr *h3;
168 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
169 int closing, int tx_ring);
171 #define V3_ALIGNMENT (8)
173 #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
175 #define BLK_PLUS_PRIV(sz_of_priv) \
176 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
178 #define PGV_FROM_VMALLOC 1
180 #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181 #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182 #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183 #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184 #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185 #define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186 #define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
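/* Worked example of the block layout math above (illustrative numbers):
 * with V3_ALIGNMENT == 8 and a user-requested tp_sizeof_priv of 13,
 *
 *	BLK_PLUS_PRIV(13) == BLK_HDR_LEN + ALIGN(13, 8)
 *	                  == BLK_HDR_LEN + 16
 *
 * i.e. packets start after the block descriptor plus the 8-byte-aligned
 * private area, which is what BLOCK_O2FP() reports for an open block.
 */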
189 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
190 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
191 struct packet_type *pt, struct net_device *orig_dev);
193 static void *packet_previous_frame(struct packet_sock *po,
194 struct packet_ring_buffer *rb,
196 static void packet_increment_head(struct packet_ring_buffer *buff);
197 static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
198 struct tpacket_block_desc *);
199 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
200 struct packet_sock *);
201 static void prb_retire_current_block(struct tpacket_kbdq_core *,
202 struct packet_sock *, unsigned int status);
203 static int prb_queue_frozen(struct tpacket_kbdq_core *);
204 static void prb_open_block(struct tpacket_kbdq_core *,
205 struct tpacket_block_desc *);
206 static void prb_retire_rx_blk_timer_expired(unsigned long);
207 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
208 static void prb_init_blk_timer(struct packet_sock *,
209 struct tpacket_kbdq_core *,
210 void (*func) (unsigned long));
211 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
212 static void prb_clear_rxhash(struct tpacket_kbdq_core *,
213 struct tpacket3_hdr *);
214 static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
215 struct tpacket3_hdr *);
216 static void packet_flush_mclist(struct sock *sk);
218 struct packet_skb_cb {
219 unsigned int origlen;
221 struct sockaddr_pkt pkt;
222 struct sockaddr_ll ll;
226 #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
228 #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
229 #define GET_PBLOCK_DESC(x, bid) \
230 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
231 #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
232 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
233 #define GET_NEXT_PRB_BLK_NUM(x) \
234 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
235 ((x)->kactive_blk_num+1) : 0)
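/* E.g. with knum_blocks == 8 the active block advances 0, 1, ..., 7 and
 * then wraps back to 0.
 */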
237 static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
238 static void __fanout_link(struct sock *sk, struct packet_sock *po);
240 static int packet_direct_xmit(struct sk_buff *skb)
242 struct net_device *dev = skb->dev;
243 const struct net_device_ops *ops = dev->netdev_ops;
244 netdev_features_t features;
245 struct netdev_queue *txq;
246 int ret = NETDEV_TX_BUSY;
249 if (unlikely(!netif_running(dev) ||
250 !netif_carrier_ok(dev)))
253 features = netif_skb_features(skb);
254 if (skb_needs_linearize(skb, features) &&
255 __skb_linearize(skb))
258 queue_map = skb_get_queue_mapping(skb);
259 txq = netdev_get_tx_queue(dev, queue_map);
263 HARD_TX_LOCK(dev, txq, smp_processor_id());
264 if (!netif_xmit_frozen_or_stopped(txq)) {
265 ret = ops->ndo_start_xmit(skb, dev);
266 if (ret == NETDEV_TX_OK)
267 txq_trans_update(txq);
269 HARD_TX_UNLOCK(dev, txq);
273 if (!dev_xmit_complete(ret))
279 return NET_XMIT_DROP;
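/* packet_direct_xmit() is only used when the socket opts out of the qdisc
 * layer. A hedged sketch of how userspace selects this path:
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
 *
 * after which po->xmit points here rather than at dev_queue_xmit().
 */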
282 static struct net_device *packet_cached_dev_get(struct packet_sock *po)
284 struct net_device *dev;
287 dev = rcu_dereference(po->cached_dev);
295 static void packet_cached_dev_assign(struct packet_sock *po,
296 struct net_device *dev)
298 rcu_assign_pointer(po->cached_dev, dev);
301 static void packet_cached_dev_reset(struct packet_sock *po)
303 RCU_INIT_POINTER(po->cached_dev, NULL);
306 static bool packet_use_direct_xmit(const struct packet_sock *po)
308 return po->xmit == packet_direct_xmit;
311 static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
313 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
316 static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
318 const struct net_device_ops *ops = dev->netdev_ops;
321 if (ops->ndo_select_queue) {
322 queue_index = ops->ndo_select_queue(dev, skb, NULL,
323 __packet_pick_tx_queue);
324 queue_index = netdev_cap_txqueue(dev, queue_index);
326 queue_index = __packet_pick_tx_queue(dev, skb);
329 skb_set_queue_mapping(skb, queue_index);
332 /* register_prot_hook must be invoked with the po->bind_lock held,
333 * or from a context in which asynchronous accesses to the packet
334 * socket is not possible (packet_create()).
336 static void register_prot_hook(struct sock *sk)
338 struct packet_sock *po = pkt_sk(sk);
342 __fanout_link(sk, po);
344 dev_add_pack(&po->prot_hook);
351 /* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
352 * held. If the sync parameter is true, we will temporarily drop
353 * the po->bind_lock and do a synchronize_net to make sure no
354 * asynchronous packet processing paths still refer to the elements
355 * of po->prot_hook. If the sync parameter is false, it is the
356 * caller's responsibility to take care of this.
358 static void __unregister_prot_hook(struct sock *sk, bool sync)
360 struct packet_sock *po = pkt_sk(sk);
365 __fanout_unlink(sk, po);
367 __dev_remove_pack(&po->prot_hook);
372 spin_unlock(&po->bind_lock);
374 spin_lock(&po->bind_lock);
378 static void unregister_prot_hook(struct sock *sk, bool sync)
380 struct packet_sock *po = pkt_sk(sk);
383 __unregister_prot_hook(sk, sync);
386 static inline __pure struct page *pgv_to_page(void *addr)
388 if (is_vmalloc_addr(addr))
389 return vmalloc_to_page(addr);
390 return virt_to_page(addr);
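/* Ring memory can come either from the page allocator or from vmalloc
 * (see PGV_FROM_VMALLOC above), so pgv_to_page() must pick the matching
 * virtual-to-page translation before any flush_dcache_page() call.
 */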
393 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
395 union tpacket_uhdr h;
398 switch (po->tp_version) {
400 h.h1->tp_status = status;
401 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
404 h.h2->tp_status = status;
405 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
409 WARN(1, "TPACKET version not supported.\n");
416 static int __packet_get_status(struct packet_sock *po, void *frame)
418 union tpacket_uhdr h;
423 switch (po->tp_version) {
425 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
426 return h.h1->tp_status;
428 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
429 return h.h2->tp_status;
432 WARN(1, "TPACKET version not supported.\n");
438 static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
441 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
444 if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
445 ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
446 return TP_STATUS_TS_SYS_HARDWARE;
447 if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
448 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
449 return TP_STATUS_TS_RAW_HARDWARE;
452 if (ktime_to_timespec_cond(skb->tstamp, ts))
453 return TP_STATUS_TS_SOFTWARE;
458 static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
461 union tpacket_uhdr h;
465 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
469 switch (po->tp_version) {
471 h.h1->tp_sec = ts.tv_sec;
472 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
475 h.h2->tp_sec = ts.tv_sec;
476 h.h2->tp_nsec = ts.tv_nsec;
480 WARN(1, "TPACKET version not supported.\n");
484 /* one flush is safe, as both fields always lie on the same cacheline */
485 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
491 static void *packet_lookup_frame(struct packet_sock *po,
492 struct packet_ring_buffer *rb,
493 unsigned int position,
496 unsigned int pg_vec_pos, frame_offset;
497 union tpacket_uhdr h;
499 pg_vec_pos = position / rb->frames_per_block;
500 frame_offset = position % rb->frames_per_block;
502 h.raw = rb->pg_vec[pg_vec_pos].buffer +
503 (frame_offset * rb->frame_size);
505 if (status != __packet_get_status(po, h.raw))
511 static void *packet_current_frame(struct packet_sock *po,
512 struct packet_ring_buffer *rb,
515 return packet_lookup_frame(po, rb, rb->head, status);
518 static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
520 del_timer_sync(&pkc->retire_blk_timer);
523 static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
525 struct sk_buff_head *rb_queue)
527 struct tpacket_kbdq_core *pkc;
529 pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
530 GET_PBDQC_FROM_RB(&po->rx_ring);
532 spin_lock_bh(&rb_queue->lock);
533 pkc->delete_blk_timer = 1;
534 spin_unlock_bh(&rb_queue->lock);
536 prb_del_retire_blk_timer(pkc);
539 static void prb_init_blk_timer(struct packet_sock *po,
540 struct tpacket_kbdq_core *pkc,
541 void (*func) (unsigned long))
543 init_timer(&pkc->retire_blk_timer);
544 pkc->retire_blk_timer.data = (long)po;
545 pkc->retire_blk_timer.function = func;
546 pkc->retire_blk_timer.expires = jiffies;
549 static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
551 struct tpacket_kbdq_core *pkc;
556 pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
557 GET_PBDQC_FROM_RB(&po->rx_ring);
558 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
561 static int prb_calc_retire_blk_tmo(struct packet_sock *po,
562 int blk_size_in_bytes)
564 struct net_device *dev;
565 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
566 struct ethtool_cmd ecmd;
571 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
572 if (unlikely(!dev)) {
574 return DEFAULT_PRB_RETIRE_TOV;
576 err = __ethtool_get_settings(dev, &ecmd);
577 speed = ethtool_cmd_speed(&ecmd);
581 * If the link speed is this slow, you don't really
582 * need to worry about perf anyway
584 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
585 return DEFAULT_PRB_RETIRE_TOV;
592 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
604 static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
605 union tpacket_req_u *req_u)
607 p1->feature_req_word = req_u->req3.tp_feature_req_word;
610 static void init_prb_bdqc(struct packet_sock *po,
611 struct packet_ring_buffer *rb,
613 union tpacket_req_u *req_u, int tx_ring)
615 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
616 struct tpacket_block_desc *pbd;
618 memset(p1, 0x0, sizeof(*p1));
620 p1->knxt_seq_num = 1;
622 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
623 p1->pkblk_start = pg_vec[0].buffer;
624 p1->kblk_size = req_u->req3.tp_block_size;
625 p1->knum_blocks = req_u->req3.tp_block_nr;
626 p1->hdrlen = po->tp_hdrlen;
627 p1->version = po->tp_version;
628 p1->last_kactive_blk_num = 0;
629 po->stats.stats3.tp_freeze_q_cnt = 0;
630 if (req_u->req3.tp_retire_blk_tov)
631 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
633 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
634 req_u->req3.tp_block_size);
635 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
636 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
638 prb_init_ft_ops(p1, req_u);
639 prb_setup_retire_blk_timer(po, tx_ring);
640 prb_open_block(p1, pbd);
643 /* Do NOT update the last_blk_num first.
644 * Assumes sk_buff_head lock is held.
646 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
648 mod_timer(&pkc->retire_blk_timer,
649 jiffies + pkc->tov_in_jiffies);
650 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
655 * 1) We refresh the timer only when we open a block.
656 * By doing this we don't waste cycles refreshing the timer
657 * on a packet-by-packet basis.
659 * With a 1MB block-size, on a 1Gbps line, it will take
660 * i) ~8 ms to fill a block + ii) memcpy etc.
661 * In this cut we are not accounting for the memcpy time.
663 * So, if the user sets the 'tmo' to 10ms then the timer
664 * will never fire while the block is still getting filled
665 * (which is what we want). However, the user could choose
666 * to close a block early and that's fine.
668 * But when the timer does fire, we check whether or not to refresh it.
669 * Since the tmo granularity is in msecs, it is not too expensive
670 * to refresh the timer, let's say every '8' msecs.
671 * Either the user can set the 'tmo' or we can derive it based on
672 * a) line-speed and b) block-size.
673 * prb_calc_retire_blk_tmo() calculates the tmo.
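 *
 * Worked numbers for the 8ms figure above: a 1MB block holds
 * 8 * 2^20 ~= 8.4e6 bits, and a 1Gbps link delivers ~1e6 bits per msec,
 * so the block fills in roughly 8 msecs (memcpy overhead ignored).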
676 static void prb_retire_rx_blk_timer_expired(unsigned long data)
678 struct packet_sock *po = (struct packet_sock *)data;
679 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
681 struct tpacket_block_desc *pbd;
683 spin_lock(&po->sk.sk_receive_queue.lock);
685 frozen = prb_queue_frozen(pkc);
686 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
688 if (unlikely(pkc->delete_blk_timer))
691 /* We only need to plug the race when the block is partially filled.
693 * lock(); increment BLOCK_NUM_PKTS; unlock()
694 * copy_bits() is in progress ...
695 * timer fires on other cpu:
696 * we can't retire the current block because copy_bits
700 if (BLOCK_NUM_PKTS(pbd)) {
701 while (atomic_read(&pkc->blk_fill_in_prog)) {
702 /* Waiting for skb_copy_bits to finish... */
707 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
709 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
710 if (!prb_dispatch_next_block(pkc, po))
715 /* Case 1. Queue was frozen because user-space was
718 if (prb_curr_blk_in_use(pkc, pbd)) {
720 * Ok, user-space is still behind.
721 * So just refresh the timer.
725 /* Case 2. Queue was frozen, user-space caught up,
726 * now the link went idle && the timer fired.
727 * We don't have a block to close. So we open this
728 * block and restart the timer.
729 * Opening a block thaws the queue and restarts the timer;
730 * thawing/timer-refresh is a side effect.
732 prb_open_block(pkc, pbd);
739 _prb_refresh_rx_retire_blk_timer(pkc);
742 spin_unlock(&po->sk.sk_receive_queue.lock);
745 static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
746 struct tpacket_block_desc *pbd1, __u32 status)
748 /* Flush everything minus the block header */
750 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
755 /* Skip the block header (we know the header WILL fit in 4K) */
758 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
759 for (; start < end; start += PAGE_SIZE)
760 flush_dcache_page(pgv_to_page(start));
765 /* Now update the block status. */
767 BLOCK_STATUS(pbd1) = status;
769 /* Flush the block header */
771 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
773 flush_dcache_page(pgv_to_page(start));
783 * 2) Increment active_blk_num
785 * Note: We DON'T refresh the timer on purpose,
786 * because almost always the next block will be opened.
788 static void prb_close_block(struct tpacket_kbdq_core *pkc1,
789 struct tpacket_block_desc *pbd1,
790 struct packet_sock *po, unsigned int stat)
792 __u32 status = TP_STATUS_USER | stat;
794 struct tpacket3_hdr *last_pkt;
795 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
797 if (po->stats.stats3.tp_drops)
798 status |= TP_STATUS_LOSING;
800 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
801 last_pkt->tp_next_offset = 0;
803 /* Get the ts of the last pkt */
804 if (BLOCK_NUM_PKTS(pbd1)) {
805 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
806 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
808 /* Ok, we tmo'd - so get the current time */
811 h1->ts_last_pkt.ts_sec = ts.tv_sec;
812 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
817 /* Flush the block */
818 prb_flush_block(pkc1, pbd1, status);
820 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
823 static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
825 pkc->reset_pending_on_curr_blk = 0;
829 * Side effect of opening a block:
831 * 1) prb_queue is thawed.
832 * 2) retire_blk_timer is refreshed.
835 static void prb_open_block(struct tpacket_kbdq_core *pkc1,
836 struct tpacket_block_desc *pbd1)
839 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
843 /* We could have just memset this but we would lose the
844 * flexibility of making the priv area sticky
847 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
848 BLOCK_NUM_PKTS(pbd1) = 0;
849 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
853 h1->ts_first_pkt.ts_sec = ts.tv_sec;
854 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
856 pkc1->pkblk_start = (char *)pbd1;
857 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
859 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
860 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
862 pbd1->version = pkc1->version;
863 pkc1->prev = pkc1->nxt_offset;
864 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
866 prb_thaw_queue(pkc1);
867 _prb_refresh_rx_retire_blk_timer(pkc1);
873 * Queue freeze logic:
874 * 1) Assume tp_block_nr = 8 blocks.
875 * 2) At time 't0', user opens Rx ring.
876 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
877 * 4) user-space is either sleeping or processing block '0'.
878 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
879 * it will close block-7, loop around and try to fill block '0'.
881 * __packet_lookup_frame_in_block
882 * prb_retire_current_block()
883 * prb_dispatch_next_block()
884 * |->(BLOCK_STATUS == USER) evaluates to true
885 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
886 * 6) Now there are two cases:
887 * 6.1) Link goes idle right after the queue is frozen.
888 * But remember, the last open_block() refreshed the timer.
889 * When this timer expires, it will refresh itself so that we can
890 * re-open block-0 in near future.
891 * 6.2) Link is busy and keeps on receiving packets. This is a simple
892 * case and __packet_lookup_frame_in_block will check if block-0
893 * is free and can now be re-used.
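 *
 * For reference, a hedged sketch of the user-space side that keeps the
 * queue from freezing - walk the mmap()ed blocks and hand each one back
 * (block() and walk_packets() are hypothetical helpers):
 *
 *	struct tpacket_block_desc *pbd = block(ring, i);
 *	while (!(pbd->hdr.bh1.block_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel to close it
 *	walk_packets(pbd);			// consume num_pkts packets
 *	pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;	// hand block back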
895 static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
896 struct packet_sock *po)
898 pkc->reset_pending_on_curr_blk = 1;
899 po->stats.stats3.tp_freeze_q_cnt++;
902 #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
905 * If the next block is free then we will dispatch it
906 * and return a good offset.
907 * Else, we will freeze the queue.
908 * So, the caller must check the return value.
910 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
911 struct packet_sock *po)
913 struct tpacket_block_desc *pbd;
917 /* 1. Get current block num */
918 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
920 /* 2. If this block is currently in_use then freeze the queue */
921 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
922 prb_freeze_queue(pkc, po);
928 * open this block and return the offset where the first packet
929 * needs to get stored.
931 prb_open_block(pkc, pbd);
932 return (void *)pkc->nxt_offset;
935 static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
936 struct packet_sock *po, unsigned int status)
938 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
940 /* retire/close the current block */
941 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
943 * Plug the case where copy_bits() is in progress on
944 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
945 * have space to copy the pkt in the current block and
946 * called prb_retire_current_block()
948 * We don't need to worry about the TMO case because
949 * the timer-handler already handled this case.
951 if (!(status & TP_STATUS_BLK_TMO)) {
952 while (atomic_read(&pkc->blk_fill_in_prog)) {
953 /* Waiting for skb_copy_bits to finish... */
957 prb_close_block(pkc, pbd, po, status);
962 static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
963 struct tpacket_block_desc *pbd)
965 return TP_STATUS_USER & BLOCK_STATUS(pbd);
968 static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
970 return pkc->reset_pending_on_curr_blk;
973 static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
975 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
976 atomic_dec(&pkc->blk_fill_in_prog);
979 static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
980 struct tpacket3_hdr *ppd)
982 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
985 static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
986 struct tpacket3_hdr *ppd)
988 ppd->hv1.tp_rxhash = 0;
991 static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
992 struct tpacket3_hdr *ppd)
994 if (vlan_tx_tag_present(pkc->skb)) {
995 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
996 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
997 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
999 ppd->hv1.tp_vlan_tci = 0;
1000 ppd->hv1.tp_vlan_tpid = 0;
1001 ppd->tp_status = TP_STATUS_AVAILABLE;
1005 static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
1006 struct tpacket3_hdr *ppd)
1008 ppd->hv1.tp_padding = 0;
1009 prb_fill_vlan_info(pkc, ppd);
1011 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
1012 prb_fill_rxhash(pkc, ppd);
1014 prb_clear_rxhash(pkc, ppd);
1017 static void prb_fill_curr_block(char *curr,
1018 struct tpacket_kbdq_core *pkc,
1019 struct tpacket_block_desc *pbd,
1022 struct tpacket3_hdr *ppd;
1024 ppd = (struct tpacket3_hdr *)curr;
1025 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1027 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1028 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1029 BLOCK_NUM_PKTS(pbd) += 1;
1030 atomic_inc(&pkc->blk_fill_in_prog);
1031 prb_run_all_ft_ops(pkc, ppd);
1034 /* Assumes caller has the sk->rx_queue.lock */
1035 static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1036 struct sk_buff *skb,
1041 struct tpacket_kbdq_core *pkc;
1042 struct tpacket_block_desc *pbd;
1045 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
1046 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1048 /* Queue is frozen when user space is lagging behind */
1049 if (prb_queue_frozen(pkc)) {
1051 * Check if the last block, which caused the queue to freeze,
1052 * is still in use by user-space.
1054 if (prb_curr_blk_in_use(pkc, pbd)) {
1055 /* Can't record this packet */
1059 * Ok, the block was released by user-space.
1060 * Now let's open that block.
1061 * Opening a block also thaws the queue.
1062 * Thawing is a side effect.
1064 prb_open_block(pkc, pbd);
1069 curr = pkc->nxt_offset;
1071 end = (char *)pbd + pkc->kblk_size;
1073 /* first try the current block */
1074 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1075 prb_fill_curr_block(curr, pkc, pbd, len);
1076 return (void *)curr;
1079 /* Ok, close the current block */
1080 prb_retire_current_block(pkc, po, 0);
1082 /* Now, try to dispatch the next block */
1083 curr = (char *)prb_dispatch_next_block(pkc, po);
1085 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1086 prb_fill_curr_block(curr, pkc, pbd, len);
1087 return (void *)curr;
1091 * No free blocks are available. user-space hasn't caught up yet.
1092 * Queue was just frozen and now this packet will get dropped.
1097 static void *packet_current_rx_frame(struct packet_sock *po,
1098 struct sk_buff *skb,
1099 int status, unsigned int len)
1102 switch (po->tp_version) {
1105 curr = packet_lookup_frame(po, &po->rx_ring,
1106 po->rx_ring.head, status);
1109 return __packet_lookup_frame_in_block(po, skb, status, len);
1111 WARN(1, "TPACKET version not supported\n");
1117 static void *prb_lookup_block(struct packet_sock *po,
1118 struct packet_ring_buffer *rb,
1122 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1123 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1125 if (status != BLOCK_STATUS(pbd))
1130 static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1133 if (rb->prb_bdqc.kactive_blk_num)
1134 prev = rb->prb_bdqc.kactive_blk_num-1;
1136 prev = rb->prb_bdqc.knum_blocks-1;
1140 /* Assumes caller has held the rx_queue.lock */
1141 static void *__prb_previous_block(struct packet_sock *po,
1142 struct packet_ring_buffer *rb,
1145 unsigned int previous = prb_previous_blk_num(rb);
1146 return prb_lookup_block(po, rb, previous, status);
1149 static void *packet_previous_rx_frame(struct packet_sock *po,
1150 struct packet_ring_buffer *rb,
1153 if (po->tp_version <= TPACKET_V2)
1154 return packet_previous_frame(po, rb, status);
1156 return __prb_previous_block(po, rb, status);
1159 static void packet_increment_rx_head(struct packet_sock *po,
1160 struct packet_ring_buffer *rb)
1162 switch (po->tp_version) {
1165 return packet_increment_head(rb);
1168 WARN(1, "TPACKET version not supported.\n");
1174 static void *packet_previous_frame(struct packet_sock *po,
1175 struct packet_ring_buffer *rb,
1178 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1179 return packet_lookup_frame(po, rb, previous, status);
1182 static void packet_increment_head(struct packet_ring_buffer *buff)
1184 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1187 static void packet_inc_pending(struct packet_ring_buffer *rb)
1189 this_cpu_inc(*rb->pending_refcnt);
1192 static void packet_dec_pending(struct packet_ring_buffer *rb)
1194 this_cpu_dec(*rb->pending_refcnt);
1197 static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1199 unsigned int refcnt = 0;
1202 /* We don't use pending refcount in rx_ring. */
1203 if (rb->pending_refcnt == NULL)
1206 for_each_possible_cpu(cpu)
1207 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1212 static int packet_alloc_pending(struct packet_sock *po)
1214 po->rx_ring.pending_refcnt = NULL;
1216 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1217 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1223 static void packet_free_pending(struct packet_sock *po)
1225 free_percpu(po->tx_ring.pending_refcnt);
1228 static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1230 struct sock *sk = &po->sk;
1233 if (po->prot_hook.func != tpacket_rcv)
1234 return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
1237 spin_lock(&sk->sk_receive_queue.lock);
1238 if (po->tp_version == TPACKET_V3)
1239 has_room = prb_lookup_block(po, &po->rx_ring,
1240 po->rx_ring.prb_bdqc.kactive_blk_num,
1243 has_room = packet_lookup_frame(po, &po->rx_ring,
1246 spin_unlock(&sk->sk_receive_queue.lock);
1251 static void packet_sock_destruct(struct sock *sk)
1253 skb_queue_purge(&sk->sk_error_queue);
1255 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1256 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1258 if (!sock_flag(sk, SOCK_DEAD)) {
1259 pr_err("Attempt to release alive packet socket: %p\n", sk);
1263 sk_refcnt_debug_dec(sk);
1266 static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1268 int x = atomic_read(&f->rr_cur) + 1;
1276 static unsigned int fanout_demux_hash(struct packet_fanout *f,
1277 struct sk_buff *skb,
1280 return reciprocal_scale(skb_get_hash(skb), num);
1283 static unsigned int fanout_demux_lb(struct packet_fanout *f,
1284 struct sk_buff *skb,
1289 cur = atomic_read(&f->rr_cur);
1290 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1291 fanout_rr_next(f, num))) != cur)
1296 static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1297 struct sk_buff *skb,
1300 return smp_processor_id() % num;
1303 static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1304 struct sk_buff *skb,
1307 return prandom_u32_max(num);
1310 static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1311 struct sk_buff *skb,
1312 unsigned int idx, unsigned int skip,
1317 i = j = min_t(int, f->next[idx], num - 1);
1319 if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
1331 static unsigned int fanout_demux_qm(struct packet_fanout *f,
1332 struct sk_buff *skb,
1335 return skb_get_queue_mapping(skb) % num;
1338 static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1340 return f->flags & (flag >> 8);
1343 static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1344 struct packet_type *pt, struct net_device *orig_dev)
1346 struct packet_fanout *f = pt->af_packet_priv;
1347 unsigned int num = f->num_members;
1348 struct packet_sock *po;
1351 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1358 case PACKET_FANOUT_HASH:
1360 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1361 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
1365 idx = fanout_demux_hash(f, skb, num);
1367 case PACKET_FANOUT_LB:
1368 idx = fanout_demux_lb(f, skb, num);
1370 case PACKET_FANOUT_CPU:
1371 idx = fanout_demux_cpu(f, skb, num);
1373 case PACKET_FANOUT_RND:
1374 idx = fanout_demux_rnd(f, skb, num);
1376 case PACKET_FANOUT_QM:
1377 idx = fanout_demux_qm(f, skb, num);
1379 case PACKET_FANOUT_ROLLOVER:
1380 idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
1384 po = pkt_sk(f->arr[idx]);
1385 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
1386 unlikely(!packet_rcv_has_room(po, skb))) {
1387 idx = fanout_demux_rollover(f, skb, idx, idx, num);
1388 po = pkt_sk(f->arr[idx]);
1391 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
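/* A hedged sketch of how a fanout group is created from user-space: each
 * member socket joins with the same 16-bit group id, and the demux mode
 * plus flags travel in the upper 16 bits, matching the decoding in
 * fanout_add() below:
 *
 *	int val = group_id |
 *		  ((PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG) << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
 */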
1394 DEFINE_MUTEX(fanout_mutex);
1395 EXPORT_SYMBOL_GPL(fanout_mutex);
1396 static LIST_HEAD(fanout_list);
1398 static void __fanout_link(struct sock *sk, struct packet_sock *po)
1400 struct packet_fanout *f = po->fanout;
1402 spin_lock(&f->lock);
1403 f->arr[f->num_members] = sk;
1406 spin_unlock(&f->lock);
1409 static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1411 struct packet_fanout *f = po->fanout;
1414 spin_lock(&f->lock);
1415 for (i = 0; i < f->num_members; i++) {
1416 if (f->arr[i] == sk)
1419 BUG_ON(i >= f->num_members);
1420 f->arr[i] = f->arr[f->num_members - 1];
1422 spin_unlock(&f->lock);
1425 static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
1427 if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
1433 static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1435 struct packet_sock *po = pkt_sk(sk);
1436 struct packet_fanout *f, *match;
1437 u8 type = type_flags & 0xff;
1438 u8 flags = type_flags >> 8;
1442 case PACKET_FANOUT_ROLLOVER:
1443 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1445 case PACKET_FANOUT_HASH:
1446 case PACKET_FANOUT_LB:
1447 case PACKET_FANOUT_CPU:
1448 case PACKET_FANOUT_RND:
1449 case PACKET_FANOUT_QM:
1461 mutex_lock(&fanout_mutex);
1463 list_for_each_entry(f, &fanout_list, list) {
1465 read_pnet(&f->net) == sock_net(sk)) {
1471 if (match && match->flags != flags)
1475 match = kzalloc(sizeof(*match), GFP_KERNEL);
1478 write_pnet(&match->net, sock_net(sk));
1481 match->flags = flags;
1482 atomic_set(&match->rr_cur, 0);
1483 INIT_LIST_HEAD(&match->list);
1484 spin_lock_init(&match->lock);
1485 atomic_set(&match->sk_ref, 0);
1486 match->prot_hook.type = po->prot_hook.type;
1487 match->prot_hook.dev = po->prot_hook.dev;
1488 match->prot_hook.func = packet_rcv_fanout;
1489 match->prot_hook.af_packet_priv = match;
1490 match->prot_hook.id_match = match_fanout_group;
1491 dev_add_pack(&match->prot_hook);
1492 list_add(&match->list, &fanout_list);
1495 if (match->type == type &&
1496 match->prot_hook.type == po->prot_hook.type &&
1497 match->prot_hook.dev == po->prot_hook.dev) {
1499 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1500 __dev_remove_pack(&po->prot_hook);
1502 atomic_inc(&match->sk_ref);
1503 __fanout_link(sk, po);
1508 mutex_unlock(&fanout_mutex);
1512 static void fanout_release(struct sock *sk)
1514 struct packet_sock *po = pkt_sk(sk);
1515 struct packet_fanout *f;
1521 mutex_lock(&fanout_mutex);
1524 if (atomic_dec_and_test(&f->sk_ref)) {
1526 dev_remove_pack(&f->prot_hook);
1529 mutex_unlock(&fanout_mutex);
1532 static const struct proto_ops packet_ops;
1534 static const struct proto_ops packet_ops_spkt;
1536 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1537 struct packet_type *pt, struct net_device *orig_dev)
1540 struct sockaddr_pkt *spkt;
1543 * When we registered the protocol we saved the socket in the data
1544 * field for just this event.
1547 sk = pt->af_packet_priv;
1550 * Yank back the headers [hope the device set this
1551 * right or kerboom...]
1553 * Incoming packets have ll header pulled,
1556 * For outgoing ones skb->data == skb_mac_header(skb)
1557 * so that this procedure is a noop.
1560 if (skb->pkt_type == PACKET_LOOPBACK)
1563 if (!net_eq(dev_net(dev), sock_net(sk)))
1566 skb = skb_share_check(skb, GFP_ATOMIC);
1570 /* drop any routing info */
1573 /* drop conntrack reference */
1576 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1578 skb_push(skb, skb->data - skb_mac_header(skb));
1581 * The SOCK_PACKET socket receives _all_ frames.
1584 spkt->spkt_family = dev->type;
1585 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1586 spkt->spkt_protocol = skb->protocol;
1589 * Charge the memory to the socket. This is done specifically
1590 * to prevent sockets from using up all the memory.
1593 if (sock_queue_rcv_skb(sk, skb) == 0)
1604 * Output a raw packet to a device layer. This bypasses all the other
1605 * protocol layers and you must therefore supply it with a complete frame
1608 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1609 struct msghdr *msg, size_t len)
1611 struct sock *sk = sock->sk;
1612 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1613 struct sk_buff *skb = NULL;
1614 struct net_device *dev;
1620 * Get and verify the address.
1624 if (msg->msg_namelen < sizeof(struct sockaddr))
1626 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1627 proto = saddr->spkt_protocol;
1629 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1632 * Find the device first to size check it
1635 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1638 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1644 if (!(dev->flags & IFF_UP))
1648 * You may not queue a frame bigger than the mtu. This is the lowest level
1649 * raw protocol and you must do your own fragmentation at this level.
1652 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1653 if (!netif_supports_nofcs(dev)) {
1654 err = -EPROTONOSUPPORT;
1657 extra_len = 4; /* We're doing our own CRC */
1661 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1665 size_t reserved = LL_RESERVED_SPACE(dev);
1666 int tlen = dev->needed_tailroom;
1667 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1670 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1673 /* FIXME: Save some space for broken drivers that write a hard
1674 * header at transmission time by themselves. PPP is the notable
1675 * one here. This should really be fixed at the driver level.
1677 skb_reserve(skb, reserved);
1678 skb_reset_network_header(skb);
1680 /* Try to align data part correctly */
1685 skb_reset_network_header(skb);
1687 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1693 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
1694 /* Earlier code assumed this would be a VLAN pkt,
1695 * double-check this now that we have the actual
1698 struct ethhdr *ehdr;
1699 skb_reset_mac_header(skb);
1700 ehdr = eth_hdr(skb);
1701 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1707 skb->protocol = proto;
1709 skb->priority = sk->sk_priority;
1710 skb->mark = sk->sk_mark;
1712 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1714 if (unlikely(extra_len == 4))
1717 skb_probe_transport_header(skb, 0);
1719 dev_queue_xmit(skb);
1730 static unsigned int run_filter(const struct sk_buff *skb,
1731 const struct sock *sk,
1734 struct sk_filter *filter;
1737 filter = rcu_dereference(sk->sk_filter);
1739 res = SK_RUN_FILTER(filter, skb);
1746 * This function does lazy skb cloning in the hope that most packets
1747 * are discarded by BPF.
1749 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
1750 * and skb->cb are mangled. It works because (and until) packets
1751 * falling here are owned by current CPU. Output packets are cloned
1752 * by dev_queue_xmit_nit(), input packets are processed by net_bh
1753 * sequentially, so that if we return the skb to its original state on exit,
1754 * we will not harm anyone.
1757 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1758 struct packet_type *pt, struct net_device *orig_dev)
1761 struct sockaddr_ll *sll;
1762 struct packet_sock *po;
1763 u8 *skb_head = skb->data;
1764 int skb_len = skb->len;
1765 unsigned int snaplen, res;
1767 if (skb->pkt_type == PACKET_LOOPBACK)
1770 sk = pt->af_packet_priv;
1773 if (!net_eq(dev_net(dev), sock_net(sk)))
1778 if (dev->header_ops) {
1779 /* The device has an explicit notion of ll header,
1780 * exported to higher levels.
1782 * Otherwise, the device hides details of its frame
1783 * structure, so that the corresponding packet header is
1784 * never delivered to the user.
1786 if (sk->sk_type != SOCK_DGRAM)
1787 skb_push(skb, skb->data - skb_mac_header(skb));
1788 else if (skb->pkt_type == PACKET_OUTGOING) {
1789 /* Special case: outgoing packets have ll header at head */
1790 skb_pull(skb, skb_network_offset(skb));
1796 res = run_filter(skb, sk, snaplen);
1798 goto drop_n_restore;
1802 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1805 if (skb_shared(skb)) {
1806 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1810 if (skb_head != skb->data) {
1811 skb->data = skb_head;
1818 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1821 sll = &PACKET_SKB_CB(skb)->sa.ll;
1822 sll->sll_family = AF_PACKET;
1823 sll->sll_hatype = dev->type;
1824 sll->sll_protocol = skb->protocol;
1825 sll->sll_pkttype = skb->pkt_type;
1826 if (unlikely(po->origdev))
1827 sll->sll_ifindex = orig_dev->ifindex;
1829 sll->sll_ifindex = dev->ifindex;
1831 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1833 PACKET_SKB_CB(skb)->origlen = skb->len;
1835 if (pskb_trim(skb, snaplen))
1838 skb_set_owner_r(skb, sk);
1842 /* drop conntrack reference */
1845 spin_lock(&sk->sk_receive_queue.lock);
1846 po->stats.stats1.tp_packets++;
1847 skb->dropcount = atomic_read(&sk->sk_drops);
1848 __skb_queue_tail(&sk->sk_receive_queue, skb);
1849 spin_unlock(&sk->sk_receive_queue.lock);
1850 sk->sk_data_ready(sk, skb->len);
1854 spin_lock(&sk->sk_receive_queue.lock);
1855 po->stats.stats1.tp_drops++;
1856 atomic_inc(&sk->sk_drops);
1857 spin_unlock(&sk->sk_receive_queue.lock);
1860 if (skb_head != skb->data && skb_shared(skb)) {
1861 skb->data = skb_head;
1869 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1870 struct packet_type *pt, struct net_device *orig_dev)
1873 struct packet_sock *po;
1874 struct sockaddr_ll *sll;
1875 union tpacket_uhdr h;
1876 u8 *skb_head = skb->data;
1877 int skb_len = skb->len;
1878 unsigned int snaplen, res;
1879 unsigned long status = TP_STATUS_USER;
1880 unsigned short macoff, netoff, hdrlen;
1881 struct sk_buff *copy_skb = NULL;
1885 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
1886 * We may add members to them up to the current aligned size without forcing
1887 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
1889 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
1890 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
1892 if (skb->pkt_type == PACKET_LOOPBACK)
1895 sk = pt->af_packet_priv;
1898 if (!net_eq(dev_net(dev), sock_net(sk)))
1901 if (dev->header_ops) {
1902 if (sk->sk_type != SOCK_DGRAM)
1903 skb_push(skb, skb->data - skb_mac_header(skb));
1904 else if (skb->pkt_type == PACKET_OUTGOING) {
1905 /* Special case: outgoing packets have ll header at head */
1906 skb_pull(skb, skb_network_offset(skb));
1910 if (skb->ip_summed == CHECKSUM_PARTIAL)
1911 status |= TP_STATUS_CSUMNOTREADY;
1915 res = run_filter(skb, sk, snaplen);
1917 goto drop_n_restore;
1921 if (sk->sk_type == SOCK_DGRAM) {
1922 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1925 unsigned int maclen = skb_network_offset(skb);
1926 netoff = TPACKET_ALIGN(po->tp_hdrlen +
1927 (maclen < 16 ? 16 : maclen)) +
1929 macoff = netoff - maclen;
1931 if (po->tp_version <= TPACKET_V2) {
1932 if (macoff + snaplen > po->rx_ring.frame_size) {
1933 if (po->copy_thresh &&
1934 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1935 if (skb_shared(skb)) {
1936 copy_skb = skb_clone(skb, GFP_ATOMIC);
1938 copy_skb = skb_get(skb);
1939 skb_head = skb->data;
1942 skb_set_owner_r(copy_skb, sk);
1944 snaplen = po->rx_ring.frame_size - macoff;
1945 if ((int)snaplen < 0)
1949 spin_lock(&sk->sk_receive_queue.lock);
1950 h.raw = packet_current_rx_frame(po, skb,
1951 TP_STATUS_KERNEL, (macoff+snaplen));
1954 if (po->tp_version <= TPACKET_V2) {
1955 packet_increment_rx_head(po, &po->rx_ring);
1957 * LOSING will be reported until you read the stats,
1958 * because it's COR - Clear On Read.
1959 * Anyway, moving it for V1/V2 only, as V3 doesn't need this
1962 if (po->stats.stats1.tp_drops)
1963 status |= TP_STATUS_LOSING;
1965 po->stats.stats1.tp_packets++;
1967 status |= TP_STATUS_COPY;
1968 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1970 spin_unlock(&sk->sk_receive_queue.lock);
1972 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
1974 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
1975 getnstimeofday(&ts);
1977 status |= ts_status;
1979 switch (po->tp_version) {
1981 h.h1->tp_len = skb->len;
1982 h.h1->tp_snaplen = snaplen;
1983 h.h1->tp_mac = macoff;
1984 h.h1->tp_net = netoff;
1985 h.h1->tp_sec = ts.tv_sec;
1986 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
1987 hdrlen = sizeof(*h.h1);
1990 h.h2->tp_len = skb->len;
1991 h.h2->tp_snaplen = snaplen;
1992 h.h2->tp_mac = macoff;
1993 h.h2->tp_net = netoff;
1994 h.h2->tp_sec = ts.tv_sec;
1995 h.h2->tp_nsec = ts.tv_nsec;
1996 if (vlan_tx_tag_present(skb)) {
1997 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1998 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
1999 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2001 h.h2->tp_vlan_tci = 0;
2002 h.h2->tp_vlan_tpid = 0;
2004 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
2005 hdrlen = sizeof(*h.h2);
2008 /* tp_next_offset and vlan are already populated above,
2009 * so DON'T clear those fields here
2011 h.h3->tp_status |= status;
2012 h.h3->tp_len = skb->len;
2013 h.h3->tp_snaplen = snaplen;
2014 h.h3->tp_mac = macoff;
2015 h.h3->tp_net = netoff;
2016 h.h3->tp_sec = ts.tv_sec;
2017 h.h3->tp_nsec = ts.tv_nsec;
2018 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
2019 hdrlen = sizeof(*h.h3);
2025 sll = h.raw + TPACKET_ALIGN(hdrlen);
2026 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2027 sll->sll_family = AF_PACKET;
2028 sll->sll_hatype = dev->type;
2029 sll->sll_protocol = skb->protocol;
2030 sll->sll_pkttype = skb->pkt_type;
2031 if (unlikely(po->origdev))
2032 sll->sll_ifindex = orig_dev->ifindex;
2034 sll->sll_ifindex = dev->ifindex;
2038 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2039 if (po->tp_version <= TPACKET_V2) {
2042 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2045 for (start = h.raw; start < end; start += PAGE_SIZE)
2046 flush_dcache_page(pgv_to_page(start));
2051 if (po->tp_version <= TPACKET_V2)
2052 __packet_set_status(po, h.raw, status);
2054 prb_clear_blk_fill_status(&po->rx_ring);
2056 sk->sk_data_ready(sk, 0);
2059 if (skb_head != skb->data && skb_shared(skb)) {
2060 skb->data = skb_head;
2068 po->stats.stats1.tp_drops++;
2069 spin_unlock(&sk->sk_receive_queue.lock);
2071 sk->sk_data_ready(sk, 0);
2072 kfree_skb(copy_skb);
2073 goto drop_n_restore;
2076 static void tpacket_destruct_skb(struct sk_buff *skb)
2078 struct packet_sock *po = pkt_sk(skb->sk);
2080 if (likely(po->tx_ring.pg_vec)) {
2084 ph = skb_shinfo(skb)->destructor_arg;
2085 packet_dec_pending(&po->tx_ring);
2087 ts = __packet_set_timestamp(po, ph, skb);
2088 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
2094 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2095 void *frame, struct net_device *dev, int size_max,
2096 __be16 proto, unsigned char *addr, int hlen)
2098 union tpacket_uhdr ph;
2099 int to_write, offset, len, tp_len, nr_frags, len_max;
2100 struct socket *sock = po->sk.sk_socket;
2107 skb->protocol = proto;
2109 skb->priority = po->sk.sk_priority;
2110 skb->mark = po->sk.sk_mark;
2111 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
2112 skb_shinfo(skb)->destructor_arg = ph.raw;
2114 switch (po->tp_version) {
2116 tp_len = ph.h2->tp_len;
2119 tp_len = ph.h1->tp_len;
2122 if (unlikely(tp_len > size_max)) {
2123 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2127 skb_reserve(skb, hlen);
2128 skb_reset_network_header(skb);
2130 if (!packet_use_direct_xmit(po))
2131 skb_probe_transport_header(skb, 0);
2132 if (unlikely(po->tp_tx_has_off)) {
2133 int off_min, off_max, off;
2134 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2135 off_max = po->tx_ring.frame_size - tp_len;
2136 if (sock->type == SOCK_DGRAM) {
2137 switch (po->tp_version) {
2139 off = ph.h2->tp_net;
2142 off = ph.h1->tp_net;
2146 switch (po->tp_version) {
2148 off = ph.h2->tp_mac;
2151 off = ph.h1->tp_mac;
2155 if (unlikely((off < off_min) || (off_max < off)))
2157 data = ph.raw + off;
2159 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2163 if (sock->type == SOCK_DGRAM) {
2164 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2166 if (unlikely(err < 0))
2168 } else if (dev->hard_header_len) {
2169 /* net device doesn't like empty head */
2170 if (unlikely(tp_len <= dev->hard_header_len)) {
2171 pr_err("packet size is too short (%d < %d)\n",
2172 tp_len, dev->hard_header_len);
2176 skb_push(skb, dev->hard_header_len);
2177 err = skb_store_bits(skb, 0, data,
2178 dev->hard_header_len);
2182 data += dev->hard_header_len;
2183 to_write -= dev->hard_header_len;
2186 offset = offset_in_page(data);
2187 len_max = PAGE_SIZE - offset;
2188 len = ((to_write > len_max) ? len_max : to_write);
2190 skb->data_len = to_write;
2191 skb->len += to_write;
2192 skb->truesize += to_write;
2193 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2195 while (likely(to_write)) {
2196 nr_frags = skb_shinfo(skb)->nr_frags;
2198 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2199 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2204 page = pgv_to_page(data);
2206 flush_dcache_page(page);
2208 skb_fill_page_desc(skb, nr_frags, page, offset, len);
2211 len_max = PAGE_SIZE;
2212 len = ((to_write > len_max) ? len_max : to_write);
2218 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2220 struct sk_buff *skb;
2221 struct net_device *dev;
2223 int err, reserve = 0;
2225 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2226 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2227 int tp_len, size_max;
2228 unsigned char *addr;
2230 int status = TP_STATUS_AVAILABLE;
2233 mutex_lock(&po->pg_vec_lock);
2235 if (likely(saddr == NULL)) {
2236 dev = packet_cached_dev_get(po);
2241 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2243 if (msg->msg_namelen < (saddr->sll_halen
2244 + offsetof(struct sockaddr_ll,
2247 proto = saddr->sll_protocol;
2248 addr = saddr->sll_addr;
2249 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2253 if (unlikely(dev == NULL))
2256 if (unlikely(!(dev->flags & IFF_UP)))
2259 reserve = dev->hard_header_len + VLAN_HLEN;
2260 size_max = po->tx_ring.frame_size
2261 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2263 if (size_max > dev->mtu + reserve)
2264 size_max = dev->mtu + reserve;
2267 ph = packet_current_frame(po, &po->tx_ring,
2268 TP_STATUS_SEND_REQUEST);
2269 if (unlikely(ph == NULL)) {
2270 if (need_wait && need_resched())
2275 status = TP_STATUS_SEND_REQUEST;
2276 hlen = LL_RESERVED_SPACE(dev);
2277 tlen = dev->needed_tailroom;
2278 skb = sock_alloc_send_skb(&po->sk,
2279 hlen + tlen + sizeof(struct sockaddr_ll),
2282 if (unlikely(skb == NULL))
2285 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
2287 if (tp_len > dev->mtu + dev->hard_header_len) {
2288 struct ethhdr *ehdr;
2289 /* Earlier code assumed this would be a VLAN pkt,
2290 * double-check this now that we have the actual
2294 skb_reset_mac_header(skb);
2295 ehdr = eth_hdr(skb);
2296 if (ehdr->h_proto != htons(ETH_P_8021Q))
2299 if (unlikely(tp_len < 0)) {
2301 __packet_set_status(po, ph,
2302 TP_STATUS_AVAILABLE);
2303 packet_increment_head(&po->tx_ring);
2307 status = TP_STATUS_WRONG_FORMAT;
2313 packet_pick_tx_queue(dev, skb);
2315 skb->destructor = tpacket_destruct_skb;
2316 __packet_set_status(po, ph, TP_STATUS_SENDING);
2317 packet_inc_pending(&po->tx_ring);
2319 status = TP_STATUS_SEND_REQUEST;
2320 err = po->xmit(skb);
2321 if (unlikely(err > 0)) {
2322 err = net_xmit_errno(err);
2323 if (err && __packet_get_status(po, ph) ==
2324 TP_STATUS_AVAILABLE) {
2325 /* skb was destructed already */
2330 * skb was dropped but not destructed yet;
2331 * let's treat it like congestion or err < 0
2335 packet_increment_head(&po->tx_ring);
2337 } while (likely((ph != NULL) ||
2338 /* Note: packet_read_pending() might be slow if we have
2339 * to call it, as it's a per-cpu variable; but in the fast path
2340 * we already short-circuit the loop with the first
2341 * condition, and luckily don't have to take that path
2344 (need_wait && packet_read_pending(&po->tx_ring))));
2350 __packet_set_status(po, ph, status);
2355 mutex_unlock(&po->pg_vec_lock);
2359 static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2360 size_t reserve, size_t len,
2361 size_t linear, int noblock,
2364 struct sk_buff *skb;
2366 /* Under a page? Don't bother with paged skb. */
2367 if (prepad + len < PAGE_SIZE || !linear)
2370 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2375 skb_reserve(skb, reserve);
2376 skb_put(skb, linear);
2377 skb->data_len = len - linear;
2378 skb->len += len - linear;
2383 static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2385 struct sock *sk = sock->sk;
2386 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2387 struct sk_buff *skb;
2388 struct net_device *dev;
2390 unsigned char *addr;
2391 int err, reserve = 0;
2392 struct virtio_net_hdr vnet_hdr = { 0 };
2395 struct packet_sock *po = pkt_sk(sk);
2396 unsigned short gso_type = 0;
2401 * Get and verify the address.
2404 if (likely(saddr == NULL)) {
2405 dev = packet_cached_dev_get(po);
2410 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2412 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2414 proto = saddr->sll_protocol;
2415 addr = saddr->sll_addr;
2416 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2420 if (unlikely(dev == NULL))
2423 if (unlikely(!(dev->flags & IFF_UP)))
2426 if (sock->type == SOCK_RAW)
2427 reserve = dev->hard_header_len;
2428 if (po->has_vnet_hdr) {
2429 vnet_hdr_len = sizeof(vnet_hdr);
2432 if (len < vnet_hdr_len)
2435 len -= vnet_hdr_len;
2437 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2442 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2443 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2445 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2446 vnet_hdr.csum_offset + 2;
2449 if (vnet_hdr.hdr_len > len)
2452 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2453 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2454 case VIRTIO_NET_HDR_GSO_TCPV4:
2455 gso_type = SKB_GSO_TCPV4;
2457 case VIRTIO_NET_HDR_GSO_TCPV6:
2458 gso_type = SKB_GSO_TCPV6;
2460 case VIRTIO_NET_HDR_GSO_UDP:
2461 gso_type = SKB_GSO_UDP;
2467 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2468 gso_type |= SKB_GSO_TCP_ECN;
2470 if (vnet_hdr.gso_size == 0)
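	/* For reference, a hedged sketch of what userspace prepends once
	 * PACKET_VNET_HDR is enabled - e.g. checksum offload for TCP/IPv4:
	 *
	 *	struct virtio_net_hdr vh = {
	 *		.flags       = VIRTIO_NET_HDR_F_NEEDS_CSUM,
	 *		.gso_type    = VIRTIO_NET_HDR_GSO_NONE,
	 *		.csum_start  = 34,	// Ethernet + IPv4 headers (illustrative)
	 *		.csum_offset = 16,	// offsetof(struct tcphdr, check)
	 *	};
	 *	// then write vh followed by the frame in a single sendmsg()
	 */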
2476 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2477 if (!netif_supports_nofcs(dev)) {
2478 err = -EPROTONOSUPPORT;
2481 extra_len = 4; /* We're doing our own CRC */
2485 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
2489 hlen = LL_RESERVED_SPACE(dev);
2490 tlen = dev->needed_tailroom;
2491 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
2492 msg->msg_flags & MSG_DONTWAIT, &err);
2496 skb_set_network_header(skb, reserve);
2499 if (sock->type == SOCK_DGRAM &&
2500 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
2503 /* Returns -EFAULT on error */
2504 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
2508 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
2510 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
2511 /* Earlier code assumed this would be a VLAN pkt,
2512 * double-check this now that we have the actual
2515 struct ethhdr *ehdr;
2516 skb_reset_mac_header(skb);
2517 ehdr = eth_hdr(skb);
2518 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2524 skb->protocol = proto;
2526 skb->priority = sk->sk_priority;
2527 skb->mark = sk->sk_mark;
2529 packet_pick_tx_queue(dev, skb);
2531 	if (po->has_vnet_hdr) {
2532 		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2533 			if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2534 						  vnet_hdr.csum_offset)) {
2535 				err = -EINVAL;
2536 				goto out_free;
2537 			}
2538 		}
2539 
2540 		skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2541 		skb_shinfo(skb)->gso_type = gso_type;
2542 
2543 		/* Header must be checked, and gso_segs computed. */
2544 		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2545 		skb_shinfo(skb)->gso_segs = 0;
2546 
2547 		len += vnet_hdr_len;
2548 	}
2550 if (!packet_use_direct_xmit(po))
2551 skb_probe_transport_header(skb, reserve);
2552 	if (unlikely(extra_len == 4))
2553 		skb->no_fcs = 1;
2554 
2555 	err = po->xmit(skb);
2556 	if (err > 0 && (err = net_xmit_errno(err)) != 0)
2557 		goto out_unlock;
2558 
2559 	dev_put(dev);
2560 
2561 	return len;
2562 
2563 out_free:
2564 	kfree_skb(skb);
2565 out_unlock:
2566 	if (dev)
2567 		dev_put(dev);
2568 out:
2569 	return err;
2570 }
2572 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2573 			  struct msghdr *msg, size_t len)
2574 {
2575 struct sock *sk = sock->sk;
2576 struct packet_sock *po = pkt_sk(sk);
2578 if (po->tx_ring.pg_vec)
2579 		return tpacket_snd(po, msg);
2580 	else
2581 		return packet_snd(sock, msg, len);
2582 }
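/* Usage sketch (user space, illustrative; not part of this file): the
 * simplest path into packet_snd() via sendto() on a SOCK_RAW socket.
 * The frame buffer must already contain the link-layer header; the
 * interface name "eth0" and the buffer contents are assumptions.
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&sll, sizeof(sll));
 *
 * With SOCK_DGRAM the kernel builds the header itself via
 * dev_hard_header(), using sll_addr/sll_halen as the destination.
 */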
2584 /*
2585  *	Close a PACKET socket. This is fairly simple. We immediately go
2586  *	to 'closed' state and remove our protocol entry in the device list.
2587  */
2589 static int packet_release(struct socket *sock)
2590 {
2591 	struct sock *sk = sock->sk;
2592 	struct packet_sock *po;
2593 	struct net *net;
2594 	union tpacket_req_u req_u;
2595 
2596 	if (!sk)
2597 		return 0;
2598 
2599 	net = sock_net(sk);
2600 	po = pkt_sk(sk);
2602 mutex_lock(&net->packet.sklist_lock);
2603 sk_del_node_init_rcu(sk);
2604 mutex_unlock(&net->packet.sklist_lock);
2606 	preempt_disable();
2607 	sock_prot_inuse_add(net, sk->sk_prot, -1);
2608 	preempt_enable();
2610 spin_lock(&po->bind_lock);
2611 unregister_prot_hook(sk, false);
2612 packet_cached_dev_reset(po);
2614 if (po->prot_hook.dev) {
2615 dev_put(po->prot_hook.dev);
2616 		po->prot_hook.dev = NULL;
2617 	}
2618 spin_unlock(&po->bind_lock);
2620 packet_flush_mclist(sk);
2622 if (po->rx_ring.pg_vec) {
2623 memset(&req_u, 0, sizeof(req_u));
2624 		packet_set_ring(sk, &req_u, 1, 0);
2625 	}
2627 if (po->tx_ring.pg_vec) {
2628 memset(&req_u, 0, sizeof(req_u));
2629 		packet_set_ring(sk, &req_u, 1, 1);
2630 	}
2631 
2632 	fanout_release(sk);
2633 
2634 	synchronize_net();
2635 	/*
2636 	 *	Now the socket is dead. No more input will appear.
2637 	 */
2638 	sock_orphan(sk);
2639 	sock->sk = NULL;
2640 
2641 	/* Purge queues */
2643 skb_queue_purge(&sk->sk_receive_queue);
2644 packet_free_pending(po);
2645 	sk_refcnt_debug_release(sk);
2646 
2647 	sock_put(sk);
2648 	return 0;
2649 }
2651 /*
2652  *	Attach a packet hook.
2653  */
2655 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
2656 {
2657 	struct packet_sock *po = pkt_sk(sk);
2658 	const struct net_device *dev_curr;
2659 	__be16 proto_curr;
2660 	bool need_rehook;
2661 
2662 	if (po->fanout) {
2663 		if (dev)
2664 			dev_put(dev);
2665 
2666 		return -EINVAL;
2667 	}
2668 
2670 	spin_lock(&po->bind_lock);
2672 proto_curr = po->prot_hook.type;
2673 dev_curr = po->prot_hook.dev;
2675 	need_rehook = proto_curr != proto || dev_curr != dev;
2676 
2677 	if (need_rehook) {
2678 		unregister_prot_hook(sk, true);
2679 
2680 		po->num = proto;
2681 		po->prot_hook.type = proto;
2683 if (po->prot_hook.dev)
2684 dev_put(po->prot_hook.dev);
2686 po->prot_hook.dev = dev;
2688 po->ifindex = dev ? dev->ifindex : 0;
2689 		packet_cached_dev_assign(po, dev);
2690 	}
2691 
2692 	if (proto == 0 || !need_rehook)
2693 		goto out_unlock;
2695 if (!dev || (dev->flags & IFF_UP)) {
2696 		register_prot_hook(sk);
2697 	} else {
2698 		sk->sk_err = ENETDOWN;
2699 		if (!sock_flag(sk, SOCK_DEAD))
2700 			sk->sk_error_report(sk);
2701 	}
2702 
2703 out_unlock:
2704 	spin_unlock(&po->bind_lock);
2705 	return 0;
2706 }
2709 /*
2710  *	Bind a packet socket to a device
2711  */
2713 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2714 			    int addr_len)
2715 {
2716 	struct sock *sk = sock->sk;
2717 	char name[15];
2718 	struct net_device *dev;
2719 	int err = -ENODEV;
2720 
2721 	/*
2722 	 *	Check legality
2723 	 */
2724 
2725 	if (addr_len != sizeof(struct sockaddr))
2726 		return -EINVAL;
2727 	strlcpy(name, uaddr->sa_data, sizeof(name));
2728 
2729 	dev = dev_get_by_name(sock_net(sk), name);
2730 	if (dev)
2731 		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
2732 	return err;
2733 }
2735 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2736 {
2737 	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2738 	struct sock *sk = sock->sk;
2739 	struct net_device *dev = NULL;
2740 	int err;
2741 
2742 	/*
2743 	 *	Check legality
2744 	 */
2745 
2747 	if (addr_len < sizeof(struct sockaddr_ll))
2748 		return -EINVAL;
2749 	if (sll->sll_family != AF_PACKET)
2750 		return -EINVAL;
2751 
2752 	if (sll->sll_ifindex) {
2753 		err = -ENODEV;
2754 		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
2755 		if (dev == NULL)
2756 			goto out;
2757 	}
2758 	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
2759 
2760 out:
2761 	return err;
2762 }
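/* Usage sketch (user space, illustrative): a bind() that lands in
 * packet_bind() above, restricting the socket to one interface by
 * ifindex; "eth0" is an assumed interface name.
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */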
2764 static struct proto packet_proto = {
2765 	.name	  = "PACKET",
2766 	.owner	  = THIS_MODULE,
2767 	.obj_size = sizeof(struct packet_sock),
2768 };
2770 /*
2771  *	Create a packet of type SOCK_PACKET.
2772  */
2774 static int packet_create(struct net *net, struct socket *sock, int protocol,
2775 			 int kern)
2776 {
2777 	struct sock *sk;
2778 	struct packet_sock *po;
2779 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
2780 	int err;
2781 
2782 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
2783 		return -EPERM;
2784 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2785 sock->type != SOCK_PACKET)
2786 return -ESOCKTNOSUPPORT;
2788 sock->state = SS_UNCONNECTED;
2790 	err = -ENOBUFS;
2791 	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
2792 	if (sk == NULL)
2793 		goto out;
2795 sock->ops = &packet_ops;
2796 if (sock->type == SOCK_PACKET)
2797 sock->ops = &packet_ops_spkt;
2799 	sock_init_data(sock, sk);
2800 
2801 	po = pkt_sk(sk);
2802 	sk->sk_family = PF_PACKET;
2803 	po->num = proto;
2804 	po->xmit = dev_queue_xmit;
2806 	err = packet_alloc_pending(po);
2807 	if (err)
2808 		goto out2;
2810 packet_cached_dev_reset(po);
2812 sk->sk_destruct = packet_sock_destruct;
2813 sk_refcnt_debug_inc(sk);
2815 	/*
2816 	 *	Attach a protocol block
2817 	 */
2819 spin_lock_init(&po->bind_lock);
2820 mutex_init(&po->pg_vec_lock);
2821 po->prot_hook.func = packet_rcv;
2823 if (sock->type == SOCK_PACKET)
2824 po->prot_hook.func = packet_rcv_spkt;
2826 po->prot_hook.af_packet_priv = sk;
2828 	if (proto) {
2829 		po->prot_hook.type = proto;
2830 		register_prot_hook(sk);
2831 	}
2833 mutex_lock(&net->packet.sklist_lock);
2834 sk_add_node_rcu(sk, &net->packet.sklist);
2835 mutex_unlock(&net->packet.sklist_lock);
2837 	preempt_disable();
2838 	sock_prot_inuse_add(net, &packet_proto, 1);
2839 	preempt_enable();
2840 
2841 	return 0;
2842 out2:
2843 	sk_free(sk);
2844 out:
2845 	return err;
2846 }
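/* Usage sketch (user space, illustrative): the socket() call served by
 * packet_create(). It fails with EPERM unless the caller holds
 * CAP_NET_RAW in the socket's user namespace (the ns_capable() check
 * above).
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	if (fd < 0)
 *		perror("socket(AF_PACKET)");
 */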
2848 /*
2849  *	Pull a packet from our receive queue and hand it to the user.
2850  *	If necessary we block.
2851  */
2853 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2854 			  struct msghdr *msg, size_t len, int flags)
2855 {
2856 struct sock *sk = sock->sk;
2857 	struct sk_buff *skb;
2858 	int copied, err;
2859 	int vnet_hdr_len = 0;
2861 	err = -EINVAL;
2862 	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
2863 		goto out;
2865 #if 0
2866 	/* What error should we return now? EUNATTACH? */
2867 	if (pkt_sk(sk)->ifindex < 0)
2868 		return -ENODEV;
2869 #endif
2871 if (flags & MSG_ERRQUEUE) {
2872 err = sock_recv_errqueue(sk, msg, len,
2873 					 SOL_PACKET, PACKET_TX_TIMESTAMP);
2874 		goto out;
2875 	}
2877 	/*
2878 	 *	Call the generic datagram receiver. This handles all sorts
2879 	 *	of horrible races and re-entrancy so we can forget about it
2880 	 *	in the protocol layers.
2881 	 *
2882 	 *	Now it will return ENETDOWN, if the device has just gone down,
2883 	 *	but then it will block.
2884 	 */
2886 	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
2887 
2888 	/*
2889 	 *	An error occurred so return it. Because skb_recv_datagram()
2890 	 *	handles the blocking we don't see and worry about blocking
2891 	 *	retries.
2892 	 */
2893 
2894 	if (skb == NULL)
2895 		goto out;
2897 if (pkt_sk(sk)->has_vnet_hdr) {
2898 struct virtio_net_hdr vnet_hdr = { 0 };
2900 		err = -EINVAL;
2901 		vnet_hdr_len = sizeof(vnet_hdr);
2902 		if (len < vnet_hdr_len)
2903 			goto out_free;
2904 
2905 		len -= vnet_hdr_len;
2907 if (skb_is_gso(skb)) {
2908 struct skb_shared_info *sinfo = skb_shinfo(skb);
2910 /* This is a hint as to how much should be linear. */
2911 vnet_hdr.hdr_len = skb_headlen(skb);
2912 vnet_hdr.gso_size = sinfo->gso_size;
2913 if (sinfo->gso_type & SKB_GSO_TCPV4)
2914 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2915 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2916 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2917 else if (sinfo->gso_type & SKB_GSO_UDP)
2918 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2919 			else if (sinfo->gso_type & SKB_GSO_FCOE)
2920 				goto out_free;
2921 			else
2922 				BUG();
2923 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2924 				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2925 		} else
2926 			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2928 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2929 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
2930 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
2931 vnet_hdr.csum_offset = skb->csum_offset;
2932 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2933 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
2934 } /* else everything is zero */
2936 		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2937 				     vnet_hdr_len);
2938 		if (err < 0)
2939 			goto out_free;
2940 	}
2942 	/* You lose any data beyond the buffer you gave. If it worries
2943 	 * a user program they can ask the device for its MTU
2944 	 * anyway.
2945 	 */
2946 	copied = skb->len;
2947 	if (copied > len) {
2948 		copied = len;
2949 		msg->msg_flags |= MSG_TRUNC;
2950 	}
2952 	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2953 	if (err)
2954 		goto out_free;
2956 sock_recv_ts_and_drops(msg, sk, skb);
2958 if (msg->msg_name) {
2959 /* If the address length field is there to be filled
2960 		 * in, we fill it in now.
2961 		 */
2962 if (sock->type == SOCK_PACKET) {
2963 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
2964 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2965 		} else {
2966 			struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2967 msg->msg_namelen = sll->sll_halen +
2968 				offsetof(struct sockaddr_ll, sll_addr);
2969 		}
2970 		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2971 		       msg->msg_namelen);
2972 	}
2974 if (pkt_sk(sk)->auxdata) {
2975 struct tpacket_auxdata aux;
2977 aux.tp_status = TP_STATUS_USER;
2978 if (skb->ip_summed == CHECKSUM_PARTIAL)
2979 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2980 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2981 		aux.tp_snaplen = skb->len;
2982 		aux.tp_mac = 0;
2983 		aux.tp_net = skb_network_offset(skb);
2984 if (vlan_tx_tag_present(skb)) {
2985 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2986 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
2987 			aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2988 		} else {
2989 			aux.tp_vlan_tci = 0;
2990 			aux.tp_vlan_tpid = 0;
2991 		}
2992 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
2993 	}
2995 	/*
2996 	 *	Free or return the buffer as appropriate. Again this
2997 	 *	hides all the races and re-entrancy issues from us.
2998 	 */
2999 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
3001 out_free:
3002 	skb_free_datagram(sk, skb);
3003 out:
3004 	return err;
3005 }
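/* Usage sketch (user space, illustrative): one recvmsg() round trip
 * through packet_recvmsg(), including the PACKET_AUXDATA cmsg emitted
 * above. Assumes PACKET_AUXDATA was enabled with setsockopt() first.
 *
 *	char buf[65536];
 *	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct msghdr mh = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	ssize_t n = recvmsg(fd, &mh, 0);
 *	struct cmsghdr *c;
 *	for (c = CMSG_FIRSTHDR(&mh); c; c = CMSG_NXTHDR(&mh, c))
 *		if (c->cmsg_level == SOL_PACKET &&
 *		    c->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux =
 *				(struct tpacket_auxdata *)CMSG_DATA(c);
 *			// aux->tp_snaplen, aux->tp_vlan_tci, ...
 *		}
 */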
3007 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3008 			       int *uaddr_len, int peer)
3009 {
3010 struct net_device *dev;
3011 	struct sock *sk	= sock->sk;
3012 
3013 	if (peer)
3014 		return -EOPNOTSUPP;
3016 uaddr->sa_family = AF_PACKET;
3017 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
3018 	rcu_read_lock();
3019 	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3020 	if (dev)
3021 		strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
3022 	rcu_read_unlock();
3023 	*uaddr_len = sizeof(*uaddr);
3024 
3025 	return 0;
3026 }
3028 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3029 			  int *uaddr_len, int peer)
3030 {
3031 struct net_device *dev;
3032 struct sock *sk = sock->sk;
3033 struct packet_sock *po = pkt_sk(sk);
3034 	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3035 
3036 	if (peer)
3037 		return -EOPNOTSUPP;
3038 
3039 	sll->sll_family = AF_PACKET;
3040 sll->sll_ifindex = po->ifindex;
3041 sll->sll_protocol = po->num;
3042 	sll->sll_pkttype = 0;
3043 	rcu_read_lock();
3044 	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
3045 	if (dev) {
3046 		sll->sll_hatype = dev->type;
3047 sll->sll_halen = dev->addr_len;
3048 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
3049 	} else {
3050 		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
3051 		sll->sll_halen = 0;
3052 	}
3053 	rcu_read_unlock();
3054 	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
3055 
3056 	return 0;
3057 }
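/* Usage sketch (user space, illustrative): reading back the address
 * that packet_getname() fills in, e.g. after a bind().
 *
 *	struct sockaddr_ll sll;
 *	socklen_t alen = sizeof(sll);
 *	getsockname(fd, (struct sockaddr *)&sll, &alen);
 *	// sll.sll_ifindex, sll.sll_hatype and sll.sll_addr[0..sll_halen)
 *	// now describe the bound device
 */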
3059 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3060 			 int what)
3061 {
3062 	switch (i->type) {
3063 	case PACKET_MR_MULTICAST:
3064 		if (i->alen != dev->addr_len)
3065 			return -EINVAL;
3066 		if (what > 0)
3067 			return dev_mc_add(dev, i->addr);
3068 		else
3069 			return dev_mc_del(dev, i->addr);
3070 		break;
3071 case PACKET_MR_PROMISC:
3072 return dev_set_promiscuity(dev, what);
3074 case PACKET_MR_ALLMULTI:
3075 return dev_set_allmulti(dev, what);
3077 case PACKET_MR_UNICAST:
3078 		if (i->alen != dev->addr_len)
3079 			return -EINVAL;
3080 		if (what > 0)
3081 			return dev_uc_add(dev, i->addr);
3082 		else
3083 			return dev_uc_del(dev, i->addr);
3084 		break;
3085 	default:
3086 		break;
3087 	}
3088 	return 0;
3089 }
3091 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
3092 {
3093 for ( ; i; i = i->next) {
3094 if (i->ifindex == dev->ifindex)
3095 			packet_dev_mc(dev, i, what);
3096 	}
3097 }
3099 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
3100 {
3101 	struct packet_sock *po = pkt_sk(sk);
3102 	struct packet_mclist *ml, *i;
3103 	struct net_device *dev;
3104 	int err;
3105 
3106 	rtnl_lock();
3107 
3108 	err = -ENODEV;
3109 	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
3110 	if (!dev)
3111 		goto done;
3113 	err = -EINVAL;
3114 	if (mreq->mr_alen > dev->addr_len)
3115 		goto done;
3116 
3117 	err = -ENOBUFS;
3118 	i = kmalloc(sizeof(*i), GFP_KERNEL);
3119 	if (i == NULL)
3120 		goto done;
3123 for (ml = po->mclist; ml; ml = ml->next) {
3124 if (ml->ifindex == mreq->mr_ifindex &&
3125 ml->type == mreq->mr_type &&
3126 ml->alen == mreq->mr_alen &&
3127 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3128 			ml->count++;
3129 			/* Free the new element ... */
3130 			kfree(i);
3131 			goto done;
3132 		}
3133 	}
3135 i->type = mreq->mr_type;
3136 i->ifindex = mreq->mr_ifindex;
3137 i->alen = mreq->mr_alen;
3138 	memcpy(i->addr, mreq->mr_address, i->alen);
3139 	i->count = 1;
3140 	i->next = po->mclist;
3141 	po->mclist = i;
3142 	err = packet_dev_mc(dev, i, 1);
3143 	if (err) {
3144 		po->mclist = i->next;
3145 		kfree(i);
3146 	}
3147 
3148 done:
3149 	rtnl_unlock();
3150 	return err;
3151 }
3153 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
3154 {
3155 	struct packet_mclist *ml, **mlp;
3156 
3157 	rtnl_lock();
3158 
3159 	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3160 if (ml->ifindex == mreq->mr_ifindex &&
3161 ml->type == mreq->mr_type &&
3162 ml->alen == mreq->mr_alen &&
3163 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3164 if (--ml->count == 0) {
3165 				struct net_device *dev;
3166 				*mlp = ml->next;
3167 				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3168 				if (dev)
3169 					packet_dev_mc(dev, ml, -1);
3170 				kfree(ml);
3171 			}
3172 			rtnl_unlock();
3173 			return 0;
3174 		}
3175 	}
3176 	rtnl_unlock();
3177 	return -EADDRNOTAVAIL;
3178 }
3180 static void packet_flush_mclist(struct sock *sk)
3181 {
3182 	struct packet_sock *po = pkt_sk(sk);
3183 	struct packet_mclist *ml;
3184 
3185 	if (!po->mclist)
3186 		return;
3187 
3188 	rtnl_lock();
3189 	while ((ml = po->mclist) != NULL) {
3190 struct net_device *dev;
3192 po->mclist = ml->next;
3193 		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3194 		if (dev != NULL)
3195 			packet_dev_mc(dev, ml, -1);
3196 		kfree(ml);
3197 	}
3198 	rtnl_unlock();
3199 }
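/* Usage sketch (user space, illustrative): the packet_mreq interface
 * handled via PACKET_ADD_MEMBERSHIP in packet_setsockopt() below and
 * backed by packet_mc_add() above, here putting one interface (name
 * assumed) into promiscuous mode.
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */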
3201 static int
3202 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
3203 {
3204 struct sock *sk = sock->sk;
3205 	struct packet_sock *po = pkt_sk(sk);
3206 	int ret;
3208 if (level != SOL_PACKET)
3209 		return -ENOPROTOOPT;
3210 
3211 	switch (optname) {
3212 case PACKET_ADD_MEMBERSHIP:
3213 	case PACKET_DROP_MEMBERSHIP:
3214 	{
3215 		struct packet_mreq_max mreq;
3216 		int len = optlen;
3217 		memset(&mreq, 0, sizeof(mreq));
3218 		if (len < sizeof(struct packet_mreq))
3219 			return -EINVAL;
3220 		if (len > sizeof(mreq))
3221 			len = sizeof(mreq);
3222 		if (copy_from_user(&mreq, optval, len))
3223 			return -EFAULT;
3224 		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3225 			return -EINVAL;
3226 if (optname == PACKET_ADD_MEMBERSHIP)
3227 			ret = packet_mc_add(sk, &mreq);
3228 		else
3229 			ret = packet_mc_drop(sk, &mreq);
3230 		return ret;
3231 	}
3233 	case PACKET_RX_RING:
3234 	case PACKET_TX_RING:
3235 	{
3236 		union tpacket_req_u req_u;
3237 		int len;
3238 
3239 		switch (po->tp_version) {
3240 		case TPACKET_V1:
3241 		case TPACKET_V2:
3242 			len = sizeof(req_u.req);
3243 			break;
3244 		case TPACKET_V3:
3245 		default:
3246 			len = sizeof(req_u.req3);
3247 			break;
3248 		}
3249 		if (optlen < len)
3250 			return -EINVAL;
3251 		if (pkt_sk(sk)->has_vnet_hdr)
3252 			return -EINVAL;
3253 		if (copy_from_user(&req_u.req, optval, len))
3254 			return -EFAULT;
3255 		return packet_set_ring(sk, &req_u, 0,
3256 			optname == PACKET_TX_RING);
3257 	}
3258 	case PACKET_COPY_THRESH:
3259 	{
3260 		int val;
3261 
3262 		if (optlen != sizeof(val))
3263 			return -EINVAL;
3264 		if (copy_from_user(&val, optval, sizeof(val)))
3265 			return -EFAULT;
3266 
3267 		pkt_sk(sk)->copy_thresh = val;
3268 		return 0;
3269 	}
3270 	case PACKET_VERSION:
3271 	{
3272 		int val;
3273 
3274 		if (optlen != sizeof(val))
3275 			return -EINVAL;
3276 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3277 			return -EBUSY;
3278 		if (copy_from_user(&val, optval, sizeof(val)))
3279 			return -EFAULT;
3280 		switch (val) {
3281 		case TPACKET_V1:
3282 		case TPACKET_V2:
3283 		case TPACKET_V3:
3284 			po->tp_version = val;
3285 			return 0;
3286 		default:
3287 			return -EINVAL;
3288 		}
3289 	}
3290 	case PACKET_RESERVE:
3291 	{
3292 		unsigned int val;
3293 
3294 		if (optlen != sizeof(val))
3295 			return -EINVAL;
3296 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3297 			return -EBUSY;
3298 		if (copy_from_user(&val, optval, sizeof(val)))
3299 			return -EFAULT;
3300 		po->tp_reserve = val;
3301 		return 0;
3302 	}
3303 	case PACKET_LOSS:
3304 	{
3305 		unsigned int val;
3306 
3307 		if (optlen != sizeof(val))
3308 			return -EINVAL;
3309 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3310 			return -EBUSY;
3311 		if (copy_from_user(&val, optval, sizeof(val)))
3312 			return -EFAULT;
3313 		po->tp_loss = !!val;
3314 		return 0;
3315 	}
3316 	case PACKET_AUXDATA:
3317 	{
3318 		int val;
3319 
3320 		if (optlen < sizeof(val))
3321 			return -EINVAL;
3322 		if (copy_from_user(&val, optval, sizeof(val)))
3323 			return -EFAULT;
3324 
3325 		po->auxdata = !!val;
3326 		return 0;
3327 	}
3328 	case PACKET_ORIGDEV:
3329 	{
3330 		int val;
3331 
3332 		if (optlen < sizeof(val))
3333 			return -EINVAL;
3334 		if (copy_from_user(&val, optval, sizeof(val)))
3335 			return -EFAULT;
3336 
3337 		po->origdev = !!val;
3338 		return 0;
3339 	}
3340 	case PACKET_VNET_HDR:
3341 	{
3342 		int val;
3343 
3344 		if (sock->type != SOCK_RAW)
3345 			return -EINVAL;
3346 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3347 			return -EBUSY;
3348 		if (optlen < sizeof(val))
3349 			return -EINVAL;
3350 		if (copy_from_user(&val, optval, sizeof(val)))
3351 			return -EFAULT;
3352 
3353 		po->has_vnet_hdr = !!val;
3354 		return 0;
3355 	}
3356 	case PACKET_TIMESTAMP:
3357 	{
3358 		int val;
3359 
3360 		if (optlen != sizeof(val))
3361 			return -EINVAL;
3362 		if (copy_from_user(&val, optval, sizeof(val)))
3363 			return -EFAULT;
3364 
3365 		po->tp_tstamp = val;
3366 		return 0;
3367 	}
3368 	case PACKET_FANOUT:
3369 	{
3370 		int val;
3371 
3372 		if (optlen != sizeof(val))
3373 			return -EINVAL;
3374 		if (copy_from_user(&val, optval, sizeof(val)))
3375 			return -EFAULT;
3376 
3377 		return fanout_add(sk, val & 0xffff, val >> 16);
3378 	}
3379 	case PACKET_TX_HAS_OFF:
3380 	{
3381 		unsigned int val;
3382 
3383 		if (optlen != sizeof(val))
3384 			return -EINVAL;
3385 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3386 			return -EBUSY;
3387 		if (copy_from_user(&val, optval, sizeof(val)))
3388 			return -EFAULT;
3389 		po->tp_tx_has_off = !!val;
3390 		return 0;
3391 	}
3392 	case PACKET_QDISC_BYPASS:
3393 	{
3394 		int val;
3395 
3396 		if (optlen != sizeof(val))
3397 			return -EINVAL;
3398 		if (copy_from_user(&val, optval, sizeof(val)))
3399 			return -EFAULT;
3400 
3401 		po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3402 		return 0;
3403 	}
3404 	default:
3405 		return -ENOPROTOOPT;
3406 	}
3407 }
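/* Usage sketch (user space, illustrative): selecting TPACKET_V2 and
 * sizing an RX ring; the values are assumptions that satisfy the checks
 * in packet_set_ring() (block size a multiple of PAGE_SIZE, frame count
 * equal to frames per block times block count).
 *
 *	int ver = TPACKET_V2;
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 1 << 16,		// 64 KiB
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 1 << 11,		// 2 KiB
 *		.tp_frame_nr   = ((1 << 16) / (1 << 11)) * 64,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */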
3409 static int packet_getsockopt(struct socket *sock, int level, int optname,
3410 			     char __user *optval, int __user *optlen)
3411 {
3412 	int len;
3413 	int val, lv = sizeof(val);
3414 struct sock *sk = sock->sk;
3415 struct packet_sock *po = pkt_sk(sk);
3416 	void *data = &val;
3417 	union tpacket_stats_u st;
3419 if (level != SOL_PACKET)
3420 return -ENOPROTOOPT;
3422 	if (get_user(len, optlen))
3423 		return -EFAULT;
3424 
3425 	if (len < 0)
3426 		return -EINVAL;
3427 
3428 	switch (optname) {
3429 case PACKET_STATISTICS:
3430 spin_lock_bh(&sk->sk_receive_queue.lock);
3431 memcpy(&st, &po->stats, sizeof(st));
3432 memset(&po->stats, 0, sizeof(po->stats));
3433 spin_unlock_bh(&sk->sk_receive_queue.lock);
3435 		if (po->tp_version == TPACKET_V3) {
3436 			lv = sizeof(struct tpacket_stats_v3);
3437 			st.stats3.tp_packets += st.stats3.tp_drops;
3438 			data = &st.stats3;
3439 		} else {
3440 			lv = sizeof(struct tpacket_stats);
3441 			st.stats1.tp_packets += st.stats1.tp_drops;
3442 			data = &st.stats1;
3443 		}
3444 
3445 		break;
3446 	case PACKET_AUXDATA:
3447 		val = po->auxdata;
3448 		break;
3449 	case PACKET_ORIGDEV:
3450 		val = po->origdev;
3451 		break;
3452 	case PACKET_VNET_HDR:
3453 		val = po->has_vnet_hdr;
3454 		break;
3455 	case PACKET_VERSION:
3456 		val = po->tp_version;
3457 		break;
3458 	case PACKET_HDRLEN:
3459 		if (len > sizeof(int))
3460 			len = sizeof(int);
3461 		if (copy_from_user(&val, optval, len))
3462 			return -EFAULT;
3463 		switch (val) {
3464 		case TPACKET_V1:
3465 			val = sizeof(struct tpacket_hdr);
3466 			break;
3467 		case TPACKET_V2:
3468 			val = sizeof(struct tpacket2_hdr);
3469 			break;
3470 		case TPACKET_V3:
3471 			val = sizeof(struct tpacket3_hdr);
3472 			break;
3473 		default:
3474 			return -EINVAL;
3475 		}
3476 		break;
3477 	case PACKET_RESERVE:
3478 		val = po->tp_reserve;
3479 		break;
3480 	case PACKET_LOSS:
3481 		val = po->tp_loss;
3482 		break;
3483 	case PACKET_TIMESTAMP:
3484 		val = po->tp_tstamp;
3485 		break;
3486 	case PACKET_FANOUT:
3487 		val = (po->fanout ?
3488 		       ((u32)po->fanout->id |
3489 			((u32)po->fanout->type << 16) |
3490 			((u32)po->fanout->flags << 24)) :
3491 		       0);
3492 		break;
3493 	case PACKET_TX_HAS_OFF:
3494 		val = po->tp_tx_has_off;
3495 		break;
3496 	case PACKET_QDISC_BYPASS:
3497 		val = packet_use_direct_xmit(po);
3498 		break;
3499 	default:
3500 		return -ENOPROTOOPT;
3501 	}
3502 
3503 	if (len > lv)
3504 		len = lv;
3505 	if (put_user(len, optlen))
3506 		return -EFAULT;
3507 	if (copy_to_user(optval, data, len))
3508 		return -EFAULT;
3509 	return 0;
3510 }
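/* Usage sketch (user space, illustrative): draining PACKET_STATISTICS.
 * Note the counters are reset on read (the memset() above), and
 * tp_packets already includes tp_drops.
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 */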
3513 static int packet_notifier(struct notifier_block *this,
3514 			   unsigned long msg, void *ptr)
3515 {
3516 	struct sock *sk;
3517 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3518 	struct net *net = dev_net(dev);
3519 
3520 	rcu_read_lock();
3521 	sk_for_each_rcu(sk, &net->packet.sklist) {
3522 		struct packet_sock *po = pkt_sk(sk);
3523 
3524 		switch (msg) {
3525 		case NETDEV_UNREGISTER:
3526 			if (po->mclist)
3527 				packet_dev_mclist(dev, po->mclist, -1);
3528 			/* fallthrough */
3529 
3530 		case NETDEV_DOWN:
3531 if (dev->ifindex == po->ifindex) {
3532 				spin_lock(&po->bind_lock);
3533 				if (po->running) {
3534 					__unregister_prot_hook(sk, false);
3535 sk->sk_err = ENETDOWN;
3536 if (!sock_flag(sk, SOCK_DEAD))
3537 						sk->sk_error_report(sk);
3538 				}
3539 				if (msg == NETDEV_UNREGISTER) {
3540 					packet_cached_dev_reset(po);
3541 					po->ifindex = -1;
3542 					if (po->prot_hook.dev)
3543 						dev_put(po->prot_hook.dev);
3544 					po->prot_hook.dev = NULL;
3545 				}
3546 				spin_unlock(&po->bind_lock);
3547 			}
3548 			break;
3549 		case NETDEV_UP:
3550 if (dev->ifindex == po->ifindex) {
3551 				spin_lock(&po->bind_lock);
3552 				if (po->num)
3553 					register_prot_hook(sk);
3554 				spin_unlock(&po->bind_lock);
3555 			}
3556 			break;
3557 		}
3558 	}
3559 	rcu_read_unlock();
3560 	return NOTIFY_DONE;
3561 }
3564 static int packet_ioctl(struct socket *sock, unsigned int cmd,
3565 			unsigned long arg)
3566 {
3567 	struct sock *sk = sock->sk;
3568 
3569 	switch (cmd) {
3570 	case SIOCOUTQ:
3571 	{
3572 		int amount = sk_wmem_alloc_get(sk);
3573 
3574 		return put_user(amount, (int __user *)arg);
3575 	}
3576 	case SIOCINQ:
3577 	{
3578 		struct sk_buff *skb;
3579 		int amount = 0;
3581 spin_lock_bh(&sk->sk_receive_queue.lock);
3582 		skb = skb_peek(&sk->sk_receive_queue);
3583 		if (skb)
3584 			amount = skb->len;
3585 		spin_unlock_bh(&sk->sk_receive_queue.lock);
3586 		return put_user(amount, (int __user *)arg);
3587 	}
3588 	case SIOCGSTAMP:
3589 		return sock_get_timestamp(sk, (struct timeval __user *)arg);
3590 	case SIOCGSTAMPNS:
3591 		return sock_get_timestampns(sk, (struct timespec __user *)arg);
3593 #ifdef CONFIG_INET
3594 	case SIOCADDRT:
3595 	case SIOCDELRT:
3596 	case SIOCDARP:
3597 	case SIOCGARP:
3598 	case SIOCSARP:
3599 	case SIOCGIFADDR:
3600 	case SIOCSIFADDR:
3601 	case SIOCGIFBRDADDR:
3602 case SIOCSIFBRDADDR:
3603 case SIOCGIFNETMASK:
3604 case SIOCSIFNETMASK:
3605 case SIOCGIFDSTADDR:
3606 case SIOCSIFDSTADDR:
3607 	case SIOCSIFFLAGS:
3608 		return inet_dgram_ops.ioctl(sock, cmd, arg);
3609 #endif
3610 
3611 	default:
3612 		return -ENOIOCTLCMD;
3613 	}
3614 	return 0;
3615 }
3617 static unsigned int packet_poll(struct file *file, struct socket *sock,
3618 				poll_table *wait)
3619 {
3620 	struct sock *sk = sock->sk;
3621 struct packet_sock *po = pkt_sk(sk);
3622 unsigned int mask = datagram_poll(file, sock, wait);
3624 spin_lock_bh(&sk->sk_receive_queue.lock);
3625 if (po->rx_ring.pg_vec) {
3626 		if (!packet_previous_rx_frame(po, &po->rx_ring,
3627 			TP_STATUS_KERNEL))
3628 			mask |= POLLIN | POLLRDNORM;
3629 	}
3630 spin_unlock_bh(&sk->sk_receive_queue.lock);
3631 spin_lock_bh(&sk->sk_write_queue.lock);
3632 if (po->tx_ring.pg_vec) {
3633 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3634 			mask |= POLLOUT | POLLWRNORM;
3635 	}
3636 	spin_unlock_bh(&sk->sk_write_queue.lock);
3637 	return mask;
3638 }
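/* Usage sketch (user space, illustrative): the wakeup model that
 * packet_poll() serves for a TPACKET_V2 RX ring; "ring" and the frame
 * walk are assumptions based on the standard tpacket frame layout.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDNORM };
 *	poll(&pfd, 1, -1);
 *	struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)(ring + off);
 *	while (hdr->tp_status & TP_STATUS_USER) {
 *		// ... consume the frame ...
 *		hdr->tp_status = TP_STATUS_KERNEL;	// hand it back
 *		// advance off to the next frame, wrapping at the end
 *	}
 */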
3641 /* Dirty? Well, I still have not learned a better way to account
3642  * for user mmaps.
3643  */
3645 static void packet_mm_open(struct vm_area_struct *vma)
3646 {
3647 struct file *file = vma->vm_file;
3648 struct socket *sock = file->private_data;
3649 	struct sock *sk = sock->sk;
3650 
3651 	if (sk)
3652 		atomic_inc(&pkt_sk(sk)->mapped);
3653 }
3655 static void packet_mm_close(struct vm_area_struct *vma)
3656 {
3657 struct file *file = vma->vm_file;
3658 struct socket *sock = file->private_data;
3659 	struct sock *sk = sock->sk;
3660 
3661 	if (sk)
3662 		atomic_dec(&pkt_sk(sk)->mapped);
3663 }
3665 static const struct vm_operations_struct packet_mmap_ops = {
3666 .open = packet_mm_open,
3667 	.close	=	packet_mm_close,
3668 };
3670 static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3671 			unsigned int len)
3672 {
3673 	int i;
3674 
3675 	for (i = 0; i < len; i++) {
3676 if (likely(pg_vec[i].buffer)) {
3677 if (is_vmalloc_addr(pg_vec[i].buffer))
3678 				vfree(pg_vec[i].buffer);
3679 			else
3680 				free_pages((unsigned long)pg_vec[i].buffer,
3681 					   order);
3682 			pg_vec[i].buffer = NULL;
3683 		}
3684 	}
3685 	kfree(pg_vec);
3686 }
3688 static char *alloc_one_pg_vec_page(unsigned long order)
3689 {
3690 	char *buffer;
3691 	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3692 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3694 	buffer = (char *) __get_free_pages(gfp_flags, order);
3695 	if (buffer)
3696 		return buffer;
3697 
3698 	/* __get_free_pages failed, fall back to vmalloc */
3699 	buffer = vzalloc((1 << order) * PAGE_SIZE);
3700 	if (buffer)
3701 		return buffer;
3703 	/* vmalloc failed, let's dig into swap here */
3704 gfp_flags &= ~__GFP_NORETRY;
3705 	buffer = (char *) __get_free_pages(gfp_flags, order);
3706 	if (buffer)
3707 		return buffer;
3708 
3709 	/* complete and utter failure */
3710 	return NULL;
3711 }
3713 static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
3714 {
3715 	unsigned int block_nr = req->tp_block_nr;
3716 	struct pgv *pg_vec;
3717 	int i;
3719 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
3720 	if (unlikely(!pg_vec))
3721 		goto out;
3723 for (i = 0; i < block_nr; i++) {
3724 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
3725 if (unlikely(!pg_vec[i].buffer))
3726 			goto out_free_pgvec;
3727 	}
3728 
3729 out:
3730 	return pg_vec;
3731 
3732 out_free_pgvec:
3733 	free_pg_vec(pg_vec, order, block_nr);
3734 	pg_vec = NULL;
3735 	goto out;
3736 }
3738 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
3739 		int closing, int tx_ring)
3740 {
3741 struct pgv *pg_vec = NULL;
3742 struct packet_sock *po = pkt_sk(sk);
3743 int was_running, order = 0;
3744 struct packet_ring_buffer *rb;
3745 	struct sk_buff_head *rb_queue;
3746 	__be16 num;
3747 	int err = -EINVAL;
3748 	/* Added to avoid minimal code churn */
3749 struct tpacket_req *req = &req_u->req;
3751 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3752 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3753 		WARN(1, "Tx-ring is not supported.\n");
3754 		goto out;
3755 	}
3757 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3758 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
3759 
3760 	err = -EBUSY;
3761 	if (!closing) {
3762 		if (atomic_read(&po->mapped))
3763 			goto out;
3764 		if (packet_read_pending(rb))
3765 			goto out;
3766 	}
3768 if (req->tp_block_nr) {
3769 		/* Sanity tests and some calculations */
3770 		err = -EBUSY;
3771 		if (unlikely(rb->pg_vec))
3772 			goto out;
3773 
3774 		switch (po->tp_version) {
3775 		case TPACKET_V1:
3776 			po->tp_hdrlen = TPACKET_HDRLEN;
3777 			break;
3778 		case TPACKET_V2:
3779 			po->tp_hdrlen = TPACKET2_HDRLEN;
3780 			break;
3781 		case TPACKET_V3:
3782 			po->tp_hdrlen = TPACKET3_HDRLEN;
3783 			break;
3784 		}
3785 
3786 		err = -EINVAL;
3787 		if (unlikely((int)req->tp_block_size <= 0))
3788 			goto out;
3789 		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
3790 			goto out;
3791 		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
3792 					po->tp_reserve))
3793 			goto out;
3794 		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
3795 			goto out;
3797 		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3798 		if (unlikely(rb->frames_per_block <= 0))
3799 			goto out;
3800 		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3801 					req->tp_frame_nr))
3802 			goto out;
3804 		err = -ENOMEM;
3805 		order = get_order(req->tp_block_size);
3806 		pg_vec = alloc_pg_vec(req, order);
3807 		if (unlikely(!pg_vec))
3808 			goto out;
3809 		switch (po->tp_version) {
3810 		case TPACKET_V3:
3811 		/* Transmit path is not supported. We checked
3812 		 * it above but just being paranoid
3813 		 */
3814 			if (!tx_ring)
3815 				init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3816 			break;
3817 		default:
3818 			break;
3819 		}
3820 	}
3821 	/* Done */
3822 	else {
3823 		err = -EINVAL;
3824 		if (unlikely(req->tp_frame_nr))
3825 			goto out;
3826 	}
3827 
3828 	lock_sock(sk);
3830 /* Detach socket from network */
3831 spin_lock(&po->bind_lock);
3832 	was_running = po->running;
3833 	num = po->num;
3834 	if (was_running) {
3835 		po->num = 0;
3836 		__unregister_prot_hook(sk, false);
3837 	}
3838 	spin_unlock(&po->bind_lock);
3839 
3840 	synchronize_net();
3842 	err = -EBUSY;
3843 	mutex_lock(&po->pg_vec_lock);
3844 	if (closing || atomic_read(&po->mapped) == 0) {
3845 		err = 0;
3846 		spin_lock_bh(&rb_queue->lock);
3847 swap(rb->pg_vec, pg_vec);
3848 		rb->frame_max = (req->tp_frame_nr - 1);
3849 		rb->head = 0;
3850 		rb->frame_size = req->tp_frame_size;
3851 spin_unlock_bh(&rb_queue->lock);
3853 swap(rb->pg_vec_order, order);
3854 swap(rb->pg_vec_len, req->tp_block_nr);
3856 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3857 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3858 tpacket_rcv : packet_rcv;
3859 skb_queue_purge(rb_queue);
3860 if (atomic_read(&po->mapped))
3861 pr_err("packet_mmap: vma is busy: %d\n",
3862 			       atomic_read(&po->mapped));
3863 	}
3864 	mutex_unlock(&po->pg_vec_lock);
3866 	spin_lock(&po->bind_lock);
3867 	if (was_running) {
3868 		po->num = num;
3869 		register_prot_hook(sk);
3870 	}
3871 	spin_unlock(&po->bind_lock);
3872 if (closing && (po->tp_version > TPACKET_V2)) {
3873 		/* Because we don't support block-based V3 on tx-ring */
3874 		if (!tx_ring)
3875 			prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3876 	}
3877 	release_sock(sk);
3878 
3879 	if (pg_vec)
3880 		free_pg_vec(pg_vec, order, req->tp_block_nr);
3881 out:
3882 	return err;
3883 }
3885 static int packet_mmap(struct file *file, struct socket *sock,
3886 		       struct vm_area_struct *vma)
3887 {
3888 	struct sock *sk = sock->sk;
3889 	struct packet_sock *po = pkt_sk(sk);
3890 	unsigned long size, expected_size;
3891 	struct packet_ring_buffer *rb;
3892 	unsigned long start;
3893 	int err = -EINVAL;
3894 	int i;
3895 
3896 	if (vma->vm_pgoff)
3897 		return -EINVAL;
3899 	mutex_lock(&po->pg_vec_lock);
3900 
3901 	expected_size = 0;
3902 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3903 		if (rb->pg_vec) {
3904 			expected_size += rb->pg_vec_len
3905 						* rb->pg_vec_pages
3906 						* PAGE_SIZE;
3907 		}
3908 	}
3909 
3910 	if (expected_size == 0)
3911 		goto out;
3913 	size = vma->vm_end - vma->vm_start;
3914 	if (size != expected_size)
3915 		goto out;
3917 start = vma->vm_start;
3918 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3919 		if (rb->pg_vec == NULL)
3920 			continue;
3921 
3922 		for (i = 0; i < rb->pg_vec_len; i++) {
3923 			struct page *page;
3924 			void *kaddr = rb->pg_vec[i].buffer;
3925 			int pg_num;
3927 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3928 page = pgv_to_page(kaddr);
3929 				err = vm_insert_page(vma, start, page);
3930 				if (unlikely(err))
3931 					goto out;
3932 				start += PAGE_SIZE;
3933 				kaddr += PAGE_SIZE;
3934 			}
3935 		}
3936 	}
3937 
3938 	atomic_inc(&po->mapped);
3939 	vma->vm_ops = &packet_mmap_ops;
3940 	err = 0;
3941 
3942 out:
3943 	mutex_unlock(&po->pg_vec_lock);
3944 	return err;
3945 }
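/* Usage sketch (user space, illustrative): mapping the ring(s) in a
 * single mmap() whose length must equal the expected_size computed
 * above (RX blocks first, then TX, contiguous, vm_pgoff == 0).
 *
 *	size_t sz = req_rx.tp_block_size * req_rx.tp_block_nr +
 *		    req_tx.tp_block_size * req_tx.tp_block_nr;
 *	void *ring = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 */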
3947 static const struct proto_ops packet_ops_spkt = {
3948 .family = PF_PACKET,
3949 .owner = THIS_MODULE,
3950 .release = packet_release,
3951 .bind = packet_bind_spkt,
3952 .connect = sock_no_connect,
3953 .socketpair = sock_no_socketpair,
3954 .accept = sock_no_accept,
3955 .getname = packet_getname_spkt,
3956 .poll = datagram_poll,
3957 .ioctl = packet_ioctl,
3958 .listen = sock_no_listen,
3959 .shutdown = sock_no_shutdown,
3960 .setsockopt = sock_no_setsockopt,
3961 .getsockopt = sock_no_getsockopt,
3962 .sendmsg = packet_sendmsg_spkt,
3963 .recvmsg = packet_recvmsg,
3964 .mmap = sock_no_mmap,
3965 	.sendpage =	sock_no_sendpage,
3966 };
3968 static const struct proto_ops packet_ops = {
3969 .family = PF_PACKET,
3970 .owner = THIS_MODULE,
3971 .release = packet_release,
3972 .bind = packet_bind,
3973 .connect = sock_no_connect,
3974 .socketpair = sock_no_socketpair,
3975 .accept = sock_no_accept,
3976 .getname = packet_getname,
3977 .poll = packet_poll,
3978 .ioctl = packet_ioctl,
3979 .listen = sock_no_listen,
3980 .shutdown = sock_no_shutdown,
3981 .setsockopt = packet_setsockopt,
3982 .getsockopt = packet_getsockopt,
3983 .sendmsg = packet_sendmsg,
3984 .recvmsg = packet_recvmsg,
3985 .mmap = packet_mmap,
3986 	.sendpage =	sock_no_sendpage,
3987 };
3989 static const struct net_proto_family packet_family_ops = {
3990 .family = PF_PACKET,
3991 .create = packet_create,
3992 	.owner	=	THIS_MODULE,
3993 };
3995 static struct notifier_block packet_netdev_notifier = {
3996 	.notifier_call =	packet_notifier,
3997 };
3999 #ifdef CONFIG_PROC_FS
4001 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
4002 	__acquires(RCU)
4003 {
4004 	struct net *net = seq_file_net(seq);
4005 
4006 	rcu_read_lock();
4007 	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
4008 }
4010 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4011 {
4012 	struct net *net = seq_file_net(seq);
4013 	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
4014 }
4016 static void packet_seq_stop(struct seq_file *seq, void *v)
4017 	__releases(RCU)
4018 {
4019 	rcu_read_unlock();
4020 }
4022 static int packet_seq_show(struct seq_file *seq, void *v)
4023 {
4024 	if (v == SEQ_START_TOKEN)
4025 		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
4026 	else {
4027 		struct sock *s = sk_entry(v);
4028 		const struct packet_sock *po = pkt_sk(s);
4029 
4030 		seq_printf(seq,
4031 			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
4032 			   s,
4033 			   atomic_read(&s->sk_refcnt),
4034 			   s->sk_type,
4035 			   ntohs(po->num),
4036 			   po->ifindex,
4037 			   po->running,
4038 			   atomic_read(&s->sk_rmem_alloc),
4039 			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
4040 			   sock_i_ino(s));
4041 	}
4042 
4043 	return 0;
4044 }
4046 static const struct seq_operations packet_seq_ops = {
4047 .start = packet_seq_start,
4048 .next = packet_seq_next,
4049 .stop = packet_seq_stop,
4050 	.show	= packet_seq_show,
4051 };
4053 static int packet_seq_open(struct inode *inode, struct file *file)
4054 {
4055 	return seq_open_net(inode, file, &packet_seq_ops,
4056 			    sizeof(struct seq_net_private));
4057 }
4059 static const struct file_operations packet_seq_fops = {
4060 .owner = THIS_MODULE,
4061 	.open		= packet_seq_open,
4062 	.read		= seq_read,
4063 	.llseek		= seq_lseek,
4064 	.release	= seq_release_net,
4065 };
4066 #endif
4069 static int __net_init packet_net_init(struct net *net)
4070 {
4071 	mutex_init(&net->packet.sklist_lock);
4072 	INIT_HLIST_HEAD(&net->packet.sklist);
4073 
4074 	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
4075 		return -ENOMEM;
4076 
4077 	return 0;
4078 }
4080 static void __net_exit packet_net_exit(struct net *net)
4081 {
4082 	remove_proc_entry("packet", net->proc_net);
4083 }
4085 static struct pernet_operations packet_net_ops = {
4086 .init = packet_net_init,
4087 	.exit = packet_net_exit,
4088 };
4091 static void __exit packet_exit(void)
4092 {
4093 	unregister_netdevice_notifier(&packet_netdev_notifier);
4094 	unregister_pernet_subsys(&packet_net_ops);
4095 	sock_unregister(PF_PACKET);
4096 	proto_unregister(&packet_proto);
4097 }
4099 static int __init packet_init(void)
4100 {
4101 	int rc = proto_register(&packet_proto, 0);
4102 
4103 	if (rc != 0)
4104 		goto out;
4105 
4106 	sock_register(&packet_family_ops);
4107 	register_pernet_subsys(&packet_net_ops);
4108 	register_netdevice_notifier(&packet_netdev_notifier);
4109 out:
4110 	return rc;
4111 }
4113 module_init(packet_init);
4114 module_exit(packet_exit);
4115 MODULE_LICENSE("GPL");
4116 MODULE_ALIAS_NETPROTO(PF_PACKET);