af_packet: Don't use skb after dev_queue_xmit()
net/packet/af_packet.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              PACKET - implements raw packet sockets.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *              Alan Cox        :       verify_area() now used correctly
 *              Alan Cox        :       new skbuff lists, look ma no backlogs!
 *              Alan Cox        :       tidied skbuff lists.
 *              Alan Cox        :       Now uses generic datagram routines I
 *                                      added. Also fixed the peek/read crash
 *                                      from all old Linux datagram code.
 *              Alan Cox        :       Uses the improved datagram code.
 *              Alan Cox        :       Added NULL's for socket options.
 *              Alan Cox        :       Re-commented the code.
 *              Alan Cox        :       Use new kernel side addressing
 *              Rob Janssen     :       Correct MTU usage.
 *              Dave Platt      :       Counter leaks caused by incorrect
 *                                      interrupt locking and some slightly
 *                                      dubious gcc output. Can you read
 *                                      compiler: it said _VOLATILE_
 *      Richard Kooijman        :       Timestamp fixes.
 *              Alan Cox        :       New buffers. Use sk->mac.raw.
 *              Alan Cox        :       sendmsg/recvmsg support.
 *              Alan Cox        :       Protocol setting support
 *      Alexey Kuznetsov        :       Untied from IPv4 stack.
 *      Cyrus Durgin            :       Fixed kerneld for kmod.
 *      Michal Ostrowski        :       Module initialization cleanup.
 *         Ulises Alonso        :       Frame number limit removal and
 *                                      packet_set_ring memory leak.
 *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
 *                                      The convention is that longer addresses
 *                                      will simply extend the hardware address
 *                                      byte arrays at the end of sockaddr_ll
 *                                      and packet_mreq.
 *              Johann Baudy    :       Added TX RING.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnel), others are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header.  PPP does this, which is wrong, because it introduces
                 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

In summary:
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */
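
/*
 * Illustrative sketch (editor's addition, not part of this file): the
 * convention above is what a userspace reader observes. On a SOCK_RAW
 * packet socket the ll header is pushed back before delivery, while
 * SOCK_DGRAM strips it. Assumed userspace code on an Ethernet device:
 *
 *      int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *      unsigned char buf[2048];
 *      ssize_t n = recv(fd, buf, sizeof(buf), 0);
 *      // with SOCK_RAW, buf[0..13] holds the Ethernet header (dst/src/type);
 *      // with SOCK_DGRAM the same recv() would start at the network header.
 */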

/* Private packet socket structures. */

struct packet_mclist {
        struct packet_mclist    *next;
        int                     ifindex;
        int                     count;
        unsigned short          type;
        unsigned short          alen;
        unsigned char           addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
        int             mr_ifindex;
        unsigned short  mr_type;
        unsigned short  mr_alen;
        unsigned char   mr_address[MAX_ADDR_LEN];
};

#ifdef CONFIG_PACKET_MMAP
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                int closing, int tx_ring);

struct packet_ring_buffer {
        char                    **pg_vec;
        unsigned int            head;
        unsigned int            frames_per_block;
        unsigned int            frame_size;
        unsigned int            frame_max;

        unsigned int            pg_vec_order;
        unsigned int            pg_vec_pages;
        unsigned int            pg_vec_len;

        atomic_t                pending;
};

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
#endif

static void packet_flush_mclist(struct sock *sk);

struct packet_sock {
        /* struct sock has to be the first member of packet_sock */
        struct sock             sk;
        struct tpacket_stats    stats;
#ifdef CONFIG_PACKET_MMAP
        struct packet_ring_buffer       rx_ring;
        struct packet_ring_buffer       tx_ring;
        int                     copy_thresh;
#endif
        struct packet_type      prot_hook;
        spinlock_t              bind_lock;
        struct mutex            pg_vec_lock;
        unsigned int            running:1,      /* prot_hook is attached */
                                auxdata:1,
                                origdev:1;
        int                     ifindex;        /* bound device         */
        __be16                  num;
        struct packet_mclist    *mclist;
#ifdef CONFIG_PACKET_MMAP
        atomic_t                mapped;
        enum tpacket_versions   tp_version;
        unsigned int            tp_hdrlen;
        unsigned int            tp_reserve;
        unsigned int            tp_loss:1;
#endif
};

struct packet_skb_cb {
        unsigned int origlen;
        union {
                struct sockaddr_pkt pkt;
                struct sockaddr_ll ll;
        } sa;
};

#define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
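
/*
 * Editor's note (illustrative assumption): packet_skb_cb is overlaid on the
 * skb->cb scratch area, so the receive path can store the originating
 * address there without allocating. The BUILD_BUG_ON() in packet_rcv()
 * checks the worst case, conceptually:
 *
 *      sizeof(struct packet_skb_cb) + MAX_ADDR_LEN - 8 <= sizeof(skb->cb)
 *
 * because sll_addr is declared 8 bytes long inside sockaddr_ll, but
 * dev_parse_header() may write up to MAX_ADDR_LEN bytes into it.
 */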

#ifdef CONFIG_PACKET_MMAP

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_status = status;
                flush_dcache_page(virt_to_page(&h.h1->tp_status));
                break;
        case TPACKET_V2:
                h.h2->tp_status = status;
                flush_dcache_page(virt_to_page(&h.h2->tp_status));
                break;
        default:
                pr_err("TPACKET version not supported\n");
                BUG();
        }

        smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        smp_rmb();

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                flush_dcache_page(virt_to_page(&h.h1->tp_status));
                return h.h1->tp_status;
        case TPACKET_V2:
                flush_dcache_page(virt_to_page(&h.h2->tp_status));
                return h.h2->tp_status;
        default:
                pr_err("TPACKET version not supported\n");
                BUG();
                return 0;
        }
}
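
/*
 * Illustrative sketch (editor's addition, not part of this file): tp_status
 * is the handshake word shared with userspace through the mmap()ed ring.
 * The kernel flips a frame from TP_STATUS_KERNEL to TP_STATUS_USER when it
 * has filled it, and userspace hands it back by writing TP_STATUS_KERNEL.
 * An assumed RX consumer loop over a TPACKET_V1 ring:
 *
 *      struct tpacket_hdr *hdr = (struct tpacket_hdr *)frame;
 *      while (!(hdr->tp_status & TP_STATUS_USER))
 *              poll(&pfd, 1, -1);              // wait for the kernel
 *      handle_frame((char *)frame + hdr->tp_mac, hdr->tp_snaplen);
 *      hdr->tp_status = TP_STATUS_KERNEL;      // return frame to the kernel
 */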

static void *packet_lookup_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                unsigned int position,
                int status)
{
        unsigned int pg_vec_pos, frame_offset;
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        pg_vec_pos = position / rb->frames_per_block;
        frame_offset = position % rb->frames_per_block;

        h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);

        if (status != __packet_get_status(po, h.raw))
                return NULL;

        return h.raw;
}
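
/*
 * Worked example (editor's illustration, assumed geometry): with a block
 * (pg_vec entry) of 4096 bytes and frame_size 2048, frames_per_block is 2,
 * so frame number 5 lives at pg_vec[5 / 2] + (5 % 2) * 2048, i.e. at offset
 * 2048 into the third block. Frames never straddle a block boundary.
 */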

static inline void *packet_current_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        return packet_lookup_frame(po, rb, rb->head, status);
}

static inline void *packet_previous_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
        return packet_lookup_frame(po, rb, previous, status);
}

static inline void packet_increment_head(struct packet_ring_buffer *buff)
{
        buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

#endif

static inline struct packet_sock *pkt_sk(struct sock *sk)
{
        return (struct packet_sock *)sk;
}

static void packet_sock_destruct(struct sock *sk)
{
        WARN_ON(atomic_read(&sk->sk_rmem_alloc));
        WARN_ON(atomic_read(&sk->sk_wmem_alloc));

        if (!sock_flag(sk, SOCK_DEAD)) {
                pr_err("Attempt to release alive packet socket: %p\n", sk);
                return;
        }

        sk_refcnt_debug_dec(sk);
}


static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_pkt *spkt;

        /*
         *      When we registered the protocol we saved the socket in the data
         *      field for just this event.
         */

        sk = pt->af_packet_priv;

        /*
         *      Yank back the headers [hope the device set this
         *      right or kerboom...]
         *
         *      Incoming packets have the ll header pulled,
         *      push it back.
         *
         *      For outgoing ones skb->data == skb_mac_header(skb)
         *      so that this procedure is a no-op.
         */

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto out;

        if (dev_net(dev) != sock_net(sk))
                goto out;

        skb = skb_share_check(skb, GFP_ATOMIC);
        if (skb == NULL)
                goto oom;

        /* drop any routing info */
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset(skb);

        spkt = &PACKET_SKB_CB(skb)->sa.pkt;

        skb_push(skb, skb->data - skb_mac_header(skb));

        /*
         *      The SOCK_PACKET socket receives _all_ frames.
         */

        spkt->spkt_family = dev->type;
        strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
        spkt->spkt_protocol = skb->protocol;

        /*
         *      Charge the memory to the socket. This is done specifically
         *      to prevent sockets using all the memory up.
         */

        if (sock_queue_rcv_skb(sk, skb) == 0)
                return 0;

out:
        kfree_skb(skb);
oom:
        return 0;
}


/*
 *      Output a raw packet to a device layer. This bypasses all the other
 *      protocol layers and you must therefore supply it with a complete frame.
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
                               struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto = 0;
        int err;

        /*
         *      Get and verify the address.
         */

        if (saddr) {
                if (msg->msg_namelen < sizeof(struct sockaddr))
                        return -EINVAL;
                if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
                        proto = saddr->spkt_protocol;
        } else
                return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */

        /*
         *      Find the device first to size check it.
         */

        saddr->spkt_device[13] = 0;
        dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
        err = -ENODEV;
        if (dev == NULL)
                goto out_unlock;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        /*
         * You may not queue a frame bigger than the mtu. This is the lowest level
         * raw protocol and you must do your own fragmentation at this level.
         */

        err = -EMSGSIZE;
        if (len > dev->mtu + dev->hard_header_len)
                goto out_unlock;

        err = -ENOBUFS;
        skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);

        /*
         * If the write buffer is full, then tough. At this level the user
         * gets to deal with the problem - do your own algorithmic backoffs.
         * That's far more flexible.
         */

        if (skb == NULL)
                goto out_unlock;

        /*
         *      Fill it in.
         */

        /* FIXME: Save some space for broken drivers that write a
         * hard header at transmission time by themselves. PPP is the
         * notable one here. This should really be fixed at the driver level.
         */
        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        /* Try to align the data part correctly */
        if (dev->header_ops) {
                skb->data -= dev->hard_header_len;
                skb->tail -= dev->hard_header_len;
                if (len < dev->hard_header_len)
                        skb_reset_network_header(skb);
        }

        /* Returns -EFAULT on error */
        err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        if (err)
                goto out_free;

        /*
         *      Now send it.
         */

        dev_queue_xmit(skb);
        dev_put(dev);
        return len;

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
        return err;
}
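
/*
 * Illustrative usage sketch (editor's assumption, not part of this file):
 * the obsolete SOCK_PACKET interface addresses the device by name and
 * expects a complete frame, e.g. from userspace:
 *
 *      struct sockaddr_pkt sp = { .spkt_family = AF_PACKET };
 *      strncpy((char *)sp.spkt_device, "eth0", sizeof(sp.spkt_device));
 *      sp.spkt_protocol = htons(ETH_P_IP);
 *      int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *      sendto(fd, frame, frame_len, 0, (struct sockaddr *)&sp, sizeof(sp));
 *
 * where "frame" already contains the link-layer header; no fragmentation
 * or header building is done on the caller's behalf.
 */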

static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
                                      unsigned int res)
{
        struct sk_filter *filter;

        rcu_read_lock_bh();
        filter = rcu_dereference(sk->sk_filter);
        if (filter != NULL)
                res = sk_run_filter(skb, filter->insns, filter->len);
        rcu_read_unlock_bh();

        return res;
}
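
/*
 * Illustrative sketch (editor's addition, not part of this file): the filter
 * consulted above is installed from userspace with SO_ATTACH_FILTER. A
 * classic BPF program that accepts every packet, using the definitions from
 * <linux/filter.h>, might look like:
 *
 *      struct sock_filter code[] = {
 *              { BPF_RET | BPF_K, 0, 0, 0xffffffff },  // accept whole packet
 *      };
 *      struct sock_fprog prog = { .len = 1, .filter = code };
 *      setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 *
 * sk_run_filter() returns the snap length: 0 drops the packet, a smaller
 * value truncates it.
 */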

/*
   This function does lazy skb cloning in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
   and skb->cb are mangled. It works because (and until) packets
   falling here are owned by the current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so that if we return the skb to its original state on exit,
   we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_ll *sll;
        struct packet_sock *po;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (dev_net(dev) != sock_net(sk))
                goto drop;

        skb->dev = dev;

        if (dev->header_ops) {
                /* The device has an explicit notion of ll header,
                   exported to higher levels.

                   Otherwise, the device hides details of its frame
                   structure, so that the corresponding packet head is
                   never delivered to the user.
                 */
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf)
                goto drop_n_acct;

        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
                if (nskb == NULL)
                        goto drop_n_acct;

                if (skb_head != skb->data) {
                        skb->data = skb_head;
                        skb->len = skb_len;
                }
                kfree_skb(skb);
                skb = nskb;
        }

        BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
                     sizeof(skb->cb));

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

        PACKET_SKB_CB(skb)->origlen = skb->len;

        if (pskb_trim(skb, snaplen))
                goto drop_n_acct;

        skb_set_owner_r(skb, sk);
        skb->dev = NULL;
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset(skb);

        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_packets++;
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        spin_unlock(&sk->sk_receive_queue.lock);
        sk->sk_data_ready(sk, skb->len);
        return 0;

drop_n_acct:
        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        consume_skb(skb);
        return 0;
}

#ifdef CONFIG_PACKET_MMAP
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct packet_sock *po;
        struct sockaddr_ll *sll;
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;
        unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
        unsigned short macoff, netoff, hdrlen;
        struct sk_buff *copy_skb = NULL;
        struct timeval tv;
        struct timespec ts;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (dev_net(dev) != sock_net(sk))
                goto drop;

        if (dev->header_ops) {
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL)
                status |= TP_STATUS_CSUMNOTREADY;

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (sk->sk_type == SOCK_DGRAM) {
                macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
                                  po->tp_reserve;
        } else {
                unsigned maclen = skb_network_offset(skb);
                netoff = TPACKET_ALIGN(po->tp_hdrlen +
                                       (maclen < 16 ? 16 : maclen)) +
                        po->tp_reserve;
                macoff = netoff - maclen;
        }

        if (macoff + snaplen > po->rx_ring.frame_size) {
                if (po->copy_thresh &&
                    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
                    (unsigned)sk->sk_rcvbuf) {
                        if (skb_shared(skb)) {
                                copy_skb = skb_clone(skb, GFP_ATOMIC);
                        } else {
                                copy_skb = skb_get(skb);
                                skb_head = skb->data;
                        }
                        if (copy_skb)
                                skb_set_owner_r(copy_skb, sk);
                }
                snaplen = po->rx_ring.frame_size - macoff;
                if ((int)snaplen < 0)
                        snaplen = 0;
        }

        spin_lock(&sk->sk_receive_queue.lock);
        h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
        if (!h.raw)
                goto ring_is_full;
        packet_increment_head(&po->rx_ring);
        po->stats.tp_packets++;
        if (copy_skb) {
                status |= TP_STATUS_COPY;
                __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
        }
        if (!po->stats.tp_drops)
                status &= ~TP_STATUS_LOSING;
        spin_unlock(&sk->sk_receive_queue.lock);

        skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_len = skb->len;
                h.h1->tp_snaplen = snaplen;
                h.h1->tp_mac = macoff;
                h.h1->tp_net = netoff;
                if (skb->tstamp.tv64)
                        tv = ktime_to_timeval(skb->tstamp);
                else
                        do_gettimeofday(&tv);
                h.h1->tp_sec = tv.tv_sec;
                h.h1->tp_usec = tv.tv_usec;
                hdrlen = sizeof(*h.h1);
                break;
        case TPACKET_V2:
                h.h2->tp_len = skb->len;
                h.h2->tp_snaplen = snaplen;
                h.h2->tp_mac = macoff;
                h.h2->tp_net = netoff;
                if (skb->tstamp.tv64)
                        ts = ktime_to_timespec(skb->tstamp);
                else
                        getnstimeofday(&ts);
                h.h2->tp_sec = ts.tv_sec;
                h.h2->tp_nsec = ts.tv_nsec;
                h.h2->tp_vlan_tci = skb->vlan_tci;
                hdrlen = sizeof(*h.h2);
                break;
        default:
                BUG();
        }

        sll = h.raw + TPACKET_ALIGN(hdrlen);
        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        __packet_set_status(po, h.raw, status);
        smp_mb();
        {
                struct page *p_start, *p_end;
                u8 *h_end = h.raw + macoff + snaplen - 1;

                p_start = virt_to_page(h.raw);
                p_end = virt_to_page(h_end);
                while (p_start <= p_end) {
                        flush_dcache_page(p_start);
                        p_start++;
                }
        }

        sk->sk_data_ready(sk, 0);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        kfree_skb(skb);
        return 0;

ring_is_full:
        po->stats.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);

        sk->sk_data_ready(sk, 0);
        kfree_skb(copy_skb);
        goto drop_n_restore;
}

static void tpacket_destruct_skb(struct sk_buff *skb)
{
        struct packet_sock *po = pkt_sk(skb->sk);
        void *ph;

        BUG_ON(skb == NULL);

        if (likely(po->tx_ring.pg_vec)) {
                ph = skb_shinfo(skb)->destructor_arg;
                BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
                BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
                atomic_dec(&po->tx_ring.pending);
                __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
        }

        sock_wfree(skb);
}
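
/*
 * Illustrative sketch (editor's addition, not part of this file): the TX
 * ring mirrors the RX handshake. Userspace fills a frame, marks it
 * TP_STATUS_SEND_REQUEST and kicks the kernel; the destructor above flips
 * it back to TP_STATUS_AVAILABLE once the device has consumed the skb.
 * An assumed TPACKET_V1 producer loop:
 *
 *      struct tpacket_hdr *hdr = (struct tpacket_hdr *)frame;
 *      while (hdr->tp_status != TP_STATUS_AVAILABLE)
 *              poll(&pfd, 1, -1);              // wait for a free slot
 *      memcpy((char *)frame + TPACKET_HDRLEN - sizeof(struct sockaddr_ll),
 *             pkt, pkt_len);                   // payload offset as used below
 *      hdr->tp_len = pkt_len;
 *      hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *      send(fd, NULL, 0, 0);                   // flush the ring
 */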

static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
                void *frame, struct net_device *dev, int size_max,
                __be16 proto, unsigned char *addr)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } ph;
        int to_write, offset, len, tp_len, nr_frags, len_max;
        struct socket *sock = po->sk.sk_socket;
        struct page *page;
        void *data;
        int err;

        ph.raw = frame;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = po->sk.sk_priority;
        skb_shinfo(skb)->destructor_arg = ph.raw;

        switch (po->tp_version) {
        case TPACKET_V2:
                tp_len = ph.h2->tp_len;
                break;
        default:
                tp_len = ph.h1->tp_len;
                break;
        }
        if (unlikely(tp_len > size_max)) {
                pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
                return -EMSGSIZE;
        }

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
        to_write = tp_len;

        if (sock->type == SOCK_DGRAM) {
                err = dev_hard_header(skb, dev, ntohs(proto), addr,
                                NULL, tp_len);
                if (unlikely(err < 0))
                        return -EINVAL;
        } else if (dev->hard_header_len) {
                /* net device doesn't like empty head */
                if (unlikely(tp_len <= dev->hard_header_len)) {
                        pr_err("packet size is too short (%d < %d)\n",
                               tp_len, dev->hard_header_len);
                        return -EINVAL;
                }

                skb_push(skb, dev->hard_header_len);
                err = skb_store_bits(skb, 0, data,
                                dev->hard_header_len);
                if (unlikely(err))
                        return err;

                data += dev->hard_header_len;
                to_write -= dev->hard_header_len;
        }

        err = -EFAULT;
        page = virt_to_page(data);
        offset = offset_in_page(data);
        len_max = PAGE_SIZE - offset;
        len = ((to_write > len_max) ? len_max : to_write);

        skb->data_len = to_write;
        skb->len += to_write;
        skb->truesize += to_write;
        atomic_add(to_write, &po->sk.sk_wmem_alloc);

        while (likely(to_write)) {
                nr_frags = skb_shinfo(skb)->nr_frags;

                if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
                        pr_err("Packet exceeds the number of skb frags (%lu)\n",
                               MAX_SKB_FRAGS);
                        return -EFAULT;
                }

                flush_dcache_page(page);
                get_page(page);
                skb_fill_page_desc(skb,
                                nr_frags,
                                page++, offset, len);
                to_write -= len;
                offset = 0;
                len_max = PAGE_SIZE;
                len = ((to_write > len_max) ? len_max : to_write);
        }

        return tp_len;
}

static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
        struct socket *sock;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        int ifindex, err, reserve = 0;
        void *ph;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        int tp_len, size_max;
        unsigned char *addr;
        int len_sum = 0;
        int status = 0;

        sock = po->sk.sk_socket;

        mutex_lock(&po->pg_vec_lock);

        err = -EBUSY;
        if (saddr == NULL) {
                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen
                                        + offsetof(struct sockaddr_ll,
                                                sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }

        dev = dev_get_by_index(sock_net(&po->sk), ifindex);
        err = -ENXIO;
        if (unlikely(dev == NULL))
                goto out;

        reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_put;

        size_max = po->tx_ring.frame_size
                - (po->tp_hdrlen - sizeof(struct sockaddr_ll));

        if (size_max > dev->mtu + reserve)
                size_max = dev->mtu + reserve;

        do {
                ph = packet_current_frame(po, &po->tx_ring,
                                TP_STATUS_SEND_REQUEST);

                if (unlikely(ph == NULL)) {
                        schedule();
                        continue;
                }

                status = TP_STATUS_SEND_REQUEST;
                skb = sock_alloc_send_skb(&po->sk,
                                LL_ALLOCATED_SPACE(dev)
                                + sizeof(struct sockaddr_ll),
                                0, &err);

                if (unlikely(skb == NULL))
                        goto out_status;

                tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
                                addr);

                if (unlikely(tp_len < 0)) {
                        if (po->tp_loss) {
                                __packet_set_status(po, ph,
                                                TP_STATUS_AVAILABLE);
                                packet_increment_head(&po->tx_ring);
                                kfree_skb(skb);
                                continue;
                        } else {
                                status = TP_STATUS_WRONG_FORMAT;
                                err = tp_len;
                                goto out_status;
                        }
                }

                skb->destructor = tpacket_destruct_skb;
                __packet_set_status(po, ph, TP_STATUS_SENDING);
                atomic_inc(&po->tx_ring.pending);

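                /*
                 * Editor's note: per this commit's subject, the skb must
                 * not be dereferenced after dev_queue_xmit() -- the driver
                 * may already have freed it and run tpacket_destruct_skb().
                 * The error path below therefore rereads the ring frame
                 * status instead of inspecting the skb.
                 */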
                err = dev_queue_xmit(skb);
                if (unlikely(err > 0)) {
                        err = net_xmit_errno(err);
                        if (err && __packet_get_status(po, ph) ==
                                   TP_STATUS_AVAILABLE) {
                                /* skb was destructed already */
                                skb = NULL;
                                goto out_status;
                        }
                        /*
                         * skb was dropped but not destructed yet;
                         * let's treat it like congestion or err < 0
                         */
                        err = 0;
                }
                packet_increment_head(&po->tx_ring);
                len_sum += tp_len;
        } while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
                                        && (atomic_read(&po->tx_ring.pending))))
              );

        err = len_sum;
        goto out_put;

out_status:
        __packet_set_status(po, ph, status);
        kfree_skb(skb);
out_put:
        dev_put(dev);
out:
        mutex_unlock(&po->pg_vec_lock);
        return err;
}
#endif

static int packet_snd(struct socket *sock,
                          struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        unsigned char *addr;
        int ifindex, err, reserve = 0;

        /*
         *      Get and verify the address.
         */

        if (saddr == NULL) {
                struct packet_sock *po = pkt_sk(sk);

                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }


        dev = dev_get_by_index(sock_net(sk), ifindex);
        err = -ENXIO;
        if (dev == NULL)
                goto out_unlock;
        if (sock->type == SOCK_RAW)
                reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        err = -EMSGSIZE;
        if (len > dev->mtu+reserve)
                goto out_unlock;

        skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
                                msg->msg_flags & MSG_DONTWAIT, &err);
        if (skb == NULL)
                goto out_unlock;

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        err = -EINVAL;
        if (sock->type == SOCK_DGRAM &&
            dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
                goto out_free;

        /* Returns -EFAULT on error */
        err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
        if (err)
                goto out_free;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;

        /*
         *      Now send it. The skb is consumed by dev_queue_xmit() even
         *      on error, so it must not be freed or touched afterwards.
         */

        err = dev_queue_xmit(skb);
        if (err > 0 && (err = net_xmit_errno(err)) != 0)
                goto out_unlock;

        dev_put(dev);

        return len;

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
out:
        return err;
}

static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
                struct msghdr *msg, size_t len)
{
#ifdef CONFIG_PACKET_MMAP
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        if (po->tx_ring.pg_vec)
                return tpacket_snd(po, msg);
        else
#endif
                return packet_snd(sock, msg, len);
}

/*
 *      Close a PACKET socket. This is fairly simple. We immediately go
 *      to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po;
        struct net *net;
#ifdef CONFIG_PACKET_MMAP
        struct tpacket_req req;
#endif

        if (!sk)
                return 0;

        net = sock_net(sk);
        po = pkt_sk(sk);

        write_lock_bh(&net->packet.sklist_lock);
        sk_del_node_init(sk);
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        write_unlock_bh(&net->packet.sklist_lock);

        /*
         *      Unhook packet receive handler.
         */

        if (po->running) {
                /*
                 *      Remove the protocol hook
                 */
                dev_remove_pack(&po->prot_hook);
                po->running = 0;
                po->num = 0;
                __sock_put(sk);
        }

        packet_flush_mclist(sk);

#ifdef CONFIG_PACKET_MMAP
        memset(&req, 0, sizeof(req));

        if (po->rx_ring.pg_vec)
                packet_set_ring(sk, &req, 1, 0);

        if (po->tx_ring.pg_vec)
                packet_set_ring(sk, &req, 1, 1);
#endif

        /*
         *      Now the socket is dead. No more input will appear.
         */

        sock_orphan(sk);
        sock->sk = NULL;

        /* Purge queues */

        skb_queue_purge(&sk->sk_receive_queue);
        sk_refcnt_debug_release(sk);

        sock_put(sk);
        return 0;
}

/*
 *      Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
        struct packet_sock *po = pkt_sk(sk);
        /*
         *      Detach an existing hook if present.
         */

        lock_sock(sk);

        spin_lock(&po->bind_lock);
        if (po->running) {
                __sock_put(sk);
                po->running = 0;
                po->num = 0;
                spin_unlock(&po->bind_lock);
                dev_remove_pack(&po->prot_hook);
                spin_lock(&po->bind_lock);
        }

        po->num = protocol;
        po->prot_hook.type = protocol;
        po->prot_hook.dev = dev;

        po->ifindex = dev ? dev->ifindex : 0;

        if (protocol == 0)
                goto out_unlock;

        if (!dev || (dev->flags & IFF_UP)) {
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        } else {
                sk->sk_err = ENETDOWN;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk->sk_error_report(sk);
        }

out_unlock:
        spin_unlock(&po->bind_lock);
        release_sock(sk);
        return 0;
}

/*
 *      Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
                            int addr_len)
{
        struct sock *sk = sock->sk;
        char name[15];
        struct net_device *dev;
        int err = -ENODEV;

        /*
         *      Check legality
         */

        if (addr_len != sizeof(struct sockaddr))
                return -EINVAL;
        strlcpy(name, uaddr->sa_data, sizeof(name));

        dev = dev_get_by_name(sock_net(sk), name);
        if (dev) {
                err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
                dev_put(dev);
        }
        return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
        struct sock *sk = sock->sk;
        struct net_device *dev = NULL;
        int err;


        /*
         *      Check legality
         */

        if (addr_len < sizeof(struct sockaddr_ll))
                return -EINVAL;
        if (sll->sll_family != AF_PACKET)
                return -EINVAL;

        if (sll->sll_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
                if (dev == NULL)
                        goto out;
        }
        err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
        if (dev)
                dev_put(dev);

out:
        return err;
}

static struct proto packet_proto = {
        .name     = "PACKET",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct packet_sock),
};

/*
 *      Create a packet socket (SOCK_DGRAM, SOCK_RAW, or the obsolete
 *      SOCK_PACKET).
 */

static int packet_create(struct net *net, struct socket *sock, int protocol)
{
        struct sock *sk;
        struct packet_sock *po;
        __be16 proto = (__force __be16)protocol; /* weird, but documented */
        int err;

        if (!capable(CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
            sock->type != SOCK_PACKET)
                return -ESOCKTNOSUPPORT;

        sock->state = SS_UNCONNECTED;

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
        if (sk == NULL)
                goto out;

        sock->ops = &packet_ops;
        if (sock->type == SOCK_PACKET)
                sock->ops = &packet_ops_spkt;

        sock_init_data(sock, sk);

        po = pkt_sk(sk);
        sk->sk_family = PF_PACKET;
        po->num = proto;

        sk->sk_destruct = packet_sock_destruct;
        sk_refcnt_debug_inc(sk);

        /*
         *      Attach a protocol block
         */

        spin_lock_init(&po->bind_lock);
        mutex_init(&po->pg_vec_lock);
        po->prot_hook.func = packet_rcv;

        if (sock->type == SOCK_PACKET)
                po->prot_hook.func = packet_rcv_spkt;

        po->prot_hook.af_packet_priv = sk;

        if (proto) {
                po->prot_hook.type = proto;
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        }

        write_lock_bh(&net->packet.sklist_lock);
        sk_add_node(sk, &net->packet.sklist);
        sock_prot_inuse_add(net, &packet_proto, 1);
        write_unlock_bh(&net->packet.sklist_lock);
        return 0;
out:
        return err;
}
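
/*
 * Illustrative usage sketch (editor's assumption, not part of this file):
 * creating and binding a packet socket from userspace exercises
 * packet_create() and packet_bind() above:
 *
 *      int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); // CAP_NET_RAW
 *      struct sockaddr_ll sll = {
 *              .sll_family   = AF_PACKET,
 *              .sll_protocol = htons(ETH_P_ALL),
 *              .sll_ifindex  = if_nametoindex("eth0"),
 *      };
 *      bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */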

/*
 *      Pull a packet from our receive queue and hand it to the user.
 *      If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
                          struct msghdr *msg, size_t len, int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int copied, err;
        struct sockaddr_ll *sll;

        err = -EINVAL;
        if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
                goto out;

#if 0
        /* What error should we return now? EUNATTACH? */
        if (pkt_sk(sk)->ifindex < 0)
                return -ENODEV;
#endif

        /*
         *      Call the generic datagram receiver. This handles all sorts
         *      of horrible races and re-entrancy so we can forget about it
         *      in the protocol layers.
         *
         *      It will return ENETDOWN if the device has just gone down,
         *      but then it will block.
         */

        skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

        /*
         *      An error occurred, so return it. Because skb_recv_datagram()
         *      handles the blocking, we don't have to worry about blocking
         *      retries here.
         */

        if (skb == NULL)
                goto out;

        /*
         *      If the address length field is there to be filled in, we fill
         *      it in now.
         */

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        if (sock->type == SOCK_PACKET)
                msg->msg_namelen = sizeof(struct sockaddr_pkt);
        else
                msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

        /*
         *      You lose any data beyond the buffer you gave. If it worries a
         *      user program they can ask the device for its MTU anyway.
         */

        copied = skb->len;
        if (copied > len) {
                copied = len;
                msg->msg_flags |= MSG_TRUNC;
        }

        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
        if (err)
                goto out_free;

        sock_recv_timestamp(msg, sk, skb);

        if (msg->msg_name)
                memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
                       msg->msg_namelen);

        if (pkt_sk(sk)->auxdata) {
                struct tpacket_auxdata aux;

                aux.tp_status = TP_STATUS_USER;
                if (skb->ip_summed == CHECKSUM_PARTIAL)
                        aux.tp_status |= TP_STATUS_CSUMNOTREADY;
                aux.tp_len = PACKET_SKB_CB(skb)->origlen;
                aux.tp_snaplen = skb->len;
                aux.tp_mac = 0;
                aux.tp_net = skb_network_offset(skb);
                aux.tp_vlan_tci = skb->vlan_tci;

                put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
        }

        /*
         *      Free or return the buffer as appropriate. Again this
         *      hides all the races and re-entrancy issues from us.
         */
        err = (flags&MSG_TRUNC) ? skb->len : copied;

out_free:
        skb_free_datagram(sk, skb);
out:
        return err;
}
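
/*
 * Illustrative sketch (editor's addition, not part of this file): the
 * PACKET_AUXDATA cmsg emitted above is read with the standard CMSG macros;
 * an assumed userspace consumer:
 *
 *      setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &(int){1}, sizeof(int));
 *      ...
 *      for (struct cmsghdr *c = CMSG_FIRSTHDR(&msg); c;
 *           c = CMSG_NXTHDR(&msg, c)) {
 *              if (c->cmsg_level == SOL_PACKET &&
 *                  c->cmsg_type == PACKET_AUXDATA) {
 *                      struct tpacket_auxdata *aux = (void *)CMSG_DATA(c);
 *                      // aux->tp_len is the original (untruncated) length
 *              }
 *      }
 */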
1516
1517 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1518                                int *uaddr_len, int peer)
1519 {
1520         struct net_device *dev;
1521         struct sock *sk = sock->sk;
1522
1523         if (peer)
1524                 return -EOPNOTSUPP;
1525
1526         uaddr->sa_family = AF_PACKET;
1527         dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
1528         if (dev) {
1529                 strlcpy(uaddr->sa_data, dev->name, 15);
1530                 dev_put(dev);
1531         } else
1532                 memset(uaddr->sa_data, 0, 14);
1533         *uaddr_len = sizeof(*uaddr);
1534
1535         return 0;
1536 }
1537
1538 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1539                           int *uaddr_len, int peer)
1540 {
1541         struct net_device *dev;
1542         struct sock *sk = sock->sk;
1543         struct packet_sock *po = pkt_sk(sk);
1544         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1545
1546         if (peer)
1547                 return -EOPNOTSUPP;
1548
1549         sll->sll_family = AF_PACKET;
1550         sll->sll_ifindex = po->ifindex;
1551         sll->sll_protocol = po->num;
1552         dev = dev_get_by_index(sock_net(sk), po->ifindex);
1553         if (dev) {
1554                 sll->sll_hatype = dev->type;
1555                 sll->sll_halen = dev->addr_len;
1556                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1557                 dev_put(dev);
1558         } else {
1559                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1560                 sll->sll_halen = 0;
1561         }
1562         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1563
1564         return 0;
1565 }
1566
1567 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1568                          int what)
1569 {
1570         switch (i->type) {
1571         case PACKET_MR_MULTICAST:
1572                 if (what > 0)
1573                         return dev_mc_add(dev, i->addr, i->alen, 0);
1574                 else
1575                         return dev_mc_delete(dev, i->addr, i->alen, 0);
1576                 break;
1577         case PACKET_MR_PROMISC:
1578                 return dev_set_promiscuity(dev, what);
1579                 break;
1580         case PACKET_MR_ALLMULTI:
1581                 return dev_set_allmulti(dev, what);
1582                 break;
1583         case PACKET_MR_UNICAST:
1584                 if (what > 0)
1585                         return dev_unicast_add(dev, i->addr);
1586                 else
1587                         return dev_unicast_delete(dev, i->addr);
1588                 break;
1589         default:
1590                 break;
1591         }
1592         return 0;
1593 }
1594
1595 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1596 {
1597         for ( ; i; i = i->next) {
1598                 if (i->ifindex == dev->ifindex)
1599                         packet_dev_mc(dev, i, what);
1600         }
1601 }
1602
1603 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1604 {
1605         struct packet_sock *po = pkt_sk(sk);
1606         struct packet_mclist *ml, *i;
1607         struct net_device *dev;
1608         int err;
1609
1610         rtnl_lock();
1611
1612         err = -ENODEV;
1613         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1614         if (!dev)
1615                 goto done;
1616
1617         err = -EINVAL;
1618         if (mreq->mr_alen > dev->addr_len)
1619                 goto done;
1620
1621         err = -ENOBUFS;
1622         i = kmalloc(sizeof(*i), GFP_KERNEL);
1623         if (i == NULL)
1624                 goto done;
1625
1626         err = 0;
1627         for (ml = po->mclist; ml; ml = ml->next) {
1628                 if (ml->ifindex == mreq->mr_ifindex &&
1629                     ml->type == mreq->mr_type &&
1630                     ml->alen == mreq->mr_alen &&
1631                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1632                         ml->count++;
1633                         /* Free the new element ... */
1634                         kfree(i);
1635                         goto done;
1636                 }
1637         }
1638
1639         i->type = mreq->mr_type;
1640         i->ifindex = mreq->mr_ifindex;
1641         i->alen = mreq->mr_alen;
1642         memcpy(i->addr, mreq->mr_address, i->alen);
1643         i->count = 1;
1644         i->next = po->mclist;
1645         po->mclist = i;
1646         err = packet_dev_mc(dev, i, 1);
1647         if (err) {
1648                 po->mclist = i->next;
1649                 kfree(i);
1650         }
1651
1652 done:
1653         rtnl_unlock();
1654         return err;
1655 }
1656
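     /*
      * PACKET_DROP_MEMBERSHIP: the inverse of packet_mc_add().  An entry
      * is unlinked and removed from the device only when its refcount
      * reaches zero; a request matching no entry fails with -EADDRNOTAVAIL.
      */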
1657 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1658 {
1659         struct packet_mclist *ml, **mlp;
1660
1661         rtnl_lock();
1662
1663         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1664                 if (ml->ifindex == mreq->mr_ifindex &&
1665                     ml->type == mreq->mr_type &&
1666                     ml->alen == mreq->mr_alen &&
1667                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1668                         if (--ml->count == 0) {
1669                                 struct net_device *dev;
1670                                 *mlp = ml->next;
1671                                 dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1672                                 if (dev) {
1673                                         packet_dev_mc(dev, ml, -1);
1674                                         dev_put(dev);
1675                                 }
1676                                 kfree(ml);
1677                         }
1678                         rtnl_unlock();
1679                         return 0;
1680                 }
1681         }
1682         rtnl_unlock();
1683         return -EADDRNOTAVAIL;
1684 }
1685
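     /*
      * Drop every remaining membership, ignoring refcounts; used when the
      * socket itself goes away.
      */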
1686 static void packet_flush_mclist(struct sock *sk)
1687 {
1688         struct packet_sock *po = pkt_sk(sk);
1689         struct packet_mclist *ml;
1690
1691         if (!po->mclist)
1692                 return;
1693
1694         rtnl_lock();
1695         while ((ml = po->mclist) != NULL) {
1696                 struct net_device *dev;
1697
1698                 po->mclist = ml->next;
1699                 dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1700                 if (dev != NULL) {
1701                         packet_dev_mc(dev, ml, -1);
1702                         dev_put(dev);
1703                 }
1704                 kfree(ml);
1705         }
1706         rtnl_unlock();
1707 }
1708
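     /*
      * setsockopt(SOL_PACKET) dispatch.  A minimal user-space sketch of
      * the membership options handled first below (assuming the usual
      * <netpacket/packet.h> and <net/if.h> declarations and an AF_PACKET
      * socket fd):
      *
      *	struct packet_mreq mreq = {
      *		.mr_ifindex = if_nametoindex("eth0"),
      *		.mr_type    = PACKET_MR_PROMISC,
      *	};
      *	if (setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
      *		       &mreq, sizeof(mreq)) < 0)
      *		perror("PACKET_ADD_MEMBERSHIP");
      */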
1709 static int
1710 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1711 {
1712         struct sock *sk = sock->sk;
1713         struct packet_sock *po = pkt_sk(sk);
1714         int ret;
1715
1716         if (level != SOL_PACKET)
1717                 return -ENOPROTOOPT;
1718
1719         switch (optname) {
1720         case PACKET_ADD_MEMBERSHIP:
1721         case PACKET_DROP_MEMBERSHIP:
1722         {
1723                 struct packet_mreq_max mreq;
1724                 int len = optlen;
1725                 memset(&mreq, 0, sizeof(mreq));
1726                 if (len < sizeof(struct packet_mreq))
1727                         return -EINVAL;
1728                 if (len > sizeof(mreq))
1729                         len = sizeof(mreq);
1730                 if (copy_from_user(&mreq, optval, len))
1731                         return -EFAULT;
1732                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1733                         return -EINVAL;
1734                 if (optname == PACKET_ADD_MEMBERSHIP)
1735                         ret = packet_mc_add(sk, &mreq);
1736                 else
1737                         ret = packet_mc_drop(sk, &mreq);
1738                 return ret;
1739         }
1740
1741 #ifdef CONFIG_PACKET_MMAP
1742         case PACKET_RX_RING:
1743         case PACKET_TX_RING:
1744         {
1745                 struct tpacket_req req;
1746
1747                 if (optlen < sizeof(req))
1748                         return -EINVAL;
1749                 if (copy_from_user(&req, optval, sizeof(req)))
1750                         return -EFAULT;
1751                 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1752         }
1753         case PACKET_COPY_THRESH:
1754         {
1755                 int val;
1756
1757                 if (optlen != sizeof(val))
1758                         return -EINVAL;
1759                 if (copy_from_user(&val, optval, sizeof(val)))
1760                         return -EFAULT;
1761
1762                 pkt_sk(sk)->copy_thresh = val;
1763                 return 0;
1764         }
1765         case PACKET_VERSION:
1766         {
1767                 int val;
1768
1769                 if (optlen != sizeof(val))
1770                         return -EINVAL;
1771                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1772                         return -EBUSY;
1773                 if (copy_from_user(&val, optval, sizeof(val)))
1774                         return -EFAULT;
1775                 switch (val) {
1776                 case TPACKET_V1:
1777                 case TPACKET_V2:
1778                         po->tp_version = val;
1779                         return 0;
1780                 default:
1781                         return -EINVAL;
1782                 }
1783         }
1784         case PACKET_RESERVE:
1785         {
1786                 unsigned int val;
1787
1788                 if (optlen != sizeof(val))
1789                         return -EINVAL;
1790                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1791                         return -EBUSY;
1792                 if (copy_from_user(&val, optval, sizeof(val)))
1793                         return -EFAULT;
1794                 po->tp_reserve = val;
1795                 return 0;
1796         }
1797         case PACKET_LOSS:
1798         {
1799                 unsigned int val;
1800
1801                 if (optlen != sizeof(val))
1802                         return -EINVAL;
1803                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1804                         return -EBUSY;
1805                 if (copy_from_user(&val, optval, sizeof(val)))
1806                         return -EFAULT;
1807                 po->tp_loss = !!val;
1808                 return 0;
1809         }
1810 #endif
1811         case PACKET_AUXDATA:
1812         {
1813                 int val;
1814
1815                 if (optlen < sizeof(val))
1816                         return -EINVAL;
1817                 if (copy_from_user(&val, optval, sizeof(val)))
1818                         return -EFAULT;
1819
1820                 po->auxdata = !!val;
1821                 return 0;
1822         }
1823         case PACKET_ORIGDEV:
1824         {
1825                 int val;
1826
1827                 if (optlen < sizeof(val))
1828                         return -EINVAL;
1829                 if (copy_from_user(&val, optval, sizeof(val)))
1830                         return -EFAULT;
1831
1832                 po->origdev = !!val;
1833                 return 0;
1834         }
1835         default:
1836                 return -ENOPROTOOPT;
1837         }
1838 }
1839
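     /*
      * getsockopt(SOL_PACKET): each option copies out at most the length
      * the caller asked for and reports the length written back via
      * optlen.  Note that PACKET_STATISTICS resets the counters under the
      * receive queue lock, so every read returns the deltas since the
      * previous one.
      */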
1840 static int packet_getsockopt(struct socket *sock, int level, int optname,
1841                              char __user *optval, int __user *optlen)
1842 {
1843         int len;
1844         int val;
1845         struct sock *sk = sock->sk;
1846         struct packet_sock *po = pkt_sk(sk);
1847         void *data;
1848         struct tpacket_stats st;
1849
1850         if (level != SOL_PACKET)
1851                 return -ENOPROTOOPT;
1852
1853         if (get_user(len, optlen))
1854                 return -EFAULT;
1855
1856         if (len < 0)
1857                 return -EINVAL;
1858
1859         switch (optname) {
1860         case PACKET_STATISTICS:
1861                 if (len > sizeof(struct tpacket_stats))
1862                         len = sizeof(struct tpacket_stats);
1863                 spin_lock_bh(&sk->sk_receive_queue.lock);
1864                 st = po->stats;
1865                 memset(&po->stats, 0, sizeof(st));
1866                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1867                 st.tp_packets += st.tp_drops;
1868
1869                 data = &st;
1870                 break;
1871         case PACKET_AUXDATA:
1872                 if (len > sizeof(int))
1873                         len = sizeof(int);
1874                 val = po->auxdata;
1875
1876                 data = &val;
1877                 break;
1878         case PACKET_ORIGDEV:
1879                 if (len > sizeof(int))
1880                         len = sizeof(int);
1881                 val = po->origdev;
1882
1883                 data = &val;
1884                 break;
1885 #ifdef CONFIG_PACKET_MMAP
1886         case PACKET_VERSION:
1887                 if (len > sizeof(int))
1888                         len = sizeof(int);
1889                 val = po->tp_version;
1890                 data = &val;
1891                 break;
1892         case PACKET_HDRLEN:
1893                 if (len > sizeof(int))
1894                         len = sizeof(int);
                     /* reject short buffers: a partial copy would leave
                      * val partly uninitialized
                      */
                     if (len < sizeof(int))
                             return -EINVAL;
1895                 if (copy_from_user(&val, optval, len))
1896                         return -EFAULT;
1897                 switch (val) {
1898                 case TPACKET_V1:
1899                         val = sizeof(struct tpacket_hdr);
1900                         break;
1901                 case TPACKET_V2:
1902                         val = sizeof(struct tpacket2_hdr);
1903                         break;
1904                 default:
1905                         return -EINVAL;
1906                 }
1907                 data = &val;
1908                 break;
1909         case PACKET_RESERVE:
1910                 if (len > sizeof(unsigned int))
1911                         len = sizeof(unsigned int);
1912                 val = po->tp_reserve;
1913                 data = &val;
1914                 break;
1915         case PACKET_LOSS:
1916                 if (len > sizeof(unsigned int))
1917                         len = sizeof(unsigned int);
1918                 val = po->tp_loss;
1919                 data = &val;
1920                 break;
1921 #endif
1922         default:
1923                 return -ENOPROTOOPT;
1924         }
1925
1926         if (put_user(len, optlen))
1927                 return -EFAULT;
1928         if (copy_to_user(optval, data, len))
1929                 return -EFAULT;
1930         return 0;
1931 }
1932
1933
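     /*
      * Netdevice notifier.  NETDEV_UNREGISTER drops the device's
      * memberships and falls through to the NETDEV_DOWN handling, which
      * unhooks any socket bound to the device and flags ENETDOWN; on
      * NETDEV_UP, sockets still bound to the ifindex and protocol are
      * re-attached.
      */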
1934 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1935 {
1936         struct sock *sk;
1937         struct hlist_node *node;
1938         struct net_device *dev = data;
1939         struct net *net = dev_net(dev);
1940
1941         read_lock(&net->packet.sklist_lock);
1942         sk_for_each(sk, node, &net->packet.sklist) {
1943                 struct packet_sock *po = pkt_sk(sk);
1944
1945                 switch (msg) {
1946                 case NETDEV_UNREGISTER:
1947                         if (po->mclist)
1948                                 packet_dev_mclist(dev, po->mclist, -1);
1949                         /* fallthrough */
1950
1951                 case NETDEV_DOWN:
1952                         if (dev->ifindex == po->ifindex) {
1953                                 spin_lock(&po->bind_lock);
1954                                 if (po->running) {
1955                                         __dev_remove_pack(&po->prot_hook);
1956                                         __sock_put(sk);
1957                                         po->running = 0;
1958                                         sk->sk_err = ENETDOWN;
1959                                         if (!sock_flag(sk, SOCK_DEAD))
1960                                                 sk->sk_error_report(sk);
1961                                 }
1962                                 if (msg == NETDEV_UNREGISTER) {
1963                                         po->ifindex = -1;
1964                                         po->prot_hook.dev = NULL;
1965                                 }
1966                                 spin_unlock(&po->bind_lock);
1967                         }
1968                         break;
1969                 case NETDEV_UP:
1970                         spin_lock(&po->bind_lock);
1971                         if (dev->ifindex == po->ifindex && po->num &&
1972                             !po->running) {
1973                                 dev_add_pack(&po->prot_hook);
1974                                 sock_hold(sk);
1975                                 po->running = 1;
1976                         }
1977                         spin_unlock(&po->bind_lock);
1978                         break;
1979                 }
1980         }
1981         read_unlock(&net->packet.sklist_lock);
1982         return NOTIFY_DONE;
1983 }
1984
1985
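     /*
      * ioctl(): SIOCOUTQ reports the bytes queued for transmit, SIOCINQ
      * the size of the next pending packet; the listed INET ioctls are
      * forwarded to inet_dgram_ops, but only in the initial network
      * namespace.
      */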
1986 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1987                         unsigned long arg)
1988 {
1989         struct sock *sk = sock->sk;
1990
1991         switch (cmd) {
1992         case SIOCOUTQ:
1993         {
1994                 int amount = sk_wmem_alloc_get(sk);
1995
1996                 return put_user(amount, (int __user *)arg);
1997         }
1998         case SIOCINQ:
1999         {
2000                 struct sk_buff *skb;
2001                 int amount = 0;
2002
2003                 spin_lock_bh(&sk->sk_receive_queue.lock);
2004                 skb = skb_peek(&sk->sk_receive_queue);
2005                 if (skb)
2006                         amount = skb->len;
2007                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2008                 return put_user(amount, (int __user *)arg);
2009         }
2010         case SIOCGSTAMP:
2011                 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2012         case SIOCGSTAMPNS:
2013                 return sock_get_timestampns(sk, (struct timespec __user *)arg);
2014
2015 #ifdef CONFIG_INET
2016         case SIOCADDRT:
2017         case SIOCDELRT:
2018         case SIOCDARP:
2019         case SIOCGARP:
2020         case SIOCSARP:
2021         case SIOCGIFADDR:
2022         case SIOCSIFADDR:
2023         case SIOCGIFBRDADDR:
2024         case SIOCSIFBRDADDR:
2025         case SIOCGIFNETMASK:
2026         case SIOCSIFNETMASK:
2027         case SIOCGIFDSTADDR:
2028         case SIOCSIFDSTADDR:
2029         case SIOCSIFFLAGS:
2030                 if (!net_eq(sock_net(sk), &init_net))
2031                         return -ENOIOCTLCMD;
2032                 return inet_dgram_ops.ioctl(sock, cmd, arg);
2033 #endif
2034
2035         default:
2036                 return -ENOIOCTLCMD;
2037         }
2038         return 0;
2039 }
2040
2041 #ifndef CONFIG_PACKET_MMAP
2042 #define packet_mmap sock_no_mmap
2043 #define packet_poll datagram_poll
2044 #else
2045
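     /*
      * poll() for ring-capable sockets: beyond datagram_poll(), report
      * readable once the slot just behind the kernel's rx head is no
      * longer kernel-owned (a filled frame awaits user space), and
      * writable while the current tx slot is TP_STATUS_AVAILABLE.
      */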
2046 static unsigned int packet_poll(struct file *file, struct socket *sock,
2047                                 poll_table *wait)
2048 {
2049         struct sock *sk = sock->sk;
2050         struct packet_sock *po = pkt_sk(sk);
2051         unsigned int mask = datagram_poll(file, sock, wait);
2052
2053         spin_lock_bh(&sk->sk_receive_queue.lock);
2054         if (po->rx_ring.pg_vec) {
2055                 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2056                         mask |= POLLIN | POLLRDNORM;
2057         }
2058         spin_unlock_bh(&sk->sk_receive_queue.lock);
2059         spin_lock_bh(&sk->sk_write_queue.lock);
2060         if (po->tx_ring.pg_vec) {
2061                 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2062                         mask |= POLLOUT | POLLWRNORM;
2063         }
2064         spin_unlock_bh(&sk->sk_write_queue.lock);
2065         return mask;
2066 }
2067
2068
2069 /* Dirty? Well, I have not yet found a better way to account for
2070  * user mmaps: po->mapped counts them via the vma open/close hooks.
2071  */
2072
2073 static void packet_mm_open(struct vm_area_struct *vma)
2074 {
2075         struct file *file = vma->vm_file;
2076         struct socket *sock = file->private_data;
2077         struct sock *sk = sock->sk;
2078
2079         if (sk)
2080                 atomic_inc(&pkt_sk(sk)->mapped);
2081 }
2082
2083 static void packet_mm_close(struct vm_area_struct *vma)
2084 {
2085         struct file *file = vma->vm_file;
2086         struct socket *sock = file->private_data;
2087         struct sock *sk = sock->sk;
2088
2089         if (sk)
2090                 atomic_dec(&pkt_sk(sk)->mapped);
2091 }
2092
2093 static const struct vm_operations_struct packet_mmap_ops = {
2094         .open   =       packet_mm_open,
2095         .close  =       packet_mm_close,
2096 };
2097
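     /*
      * Ring memory is a vector of physically contiguous blocks, each a
      * power-of-two number of zeroed compound pages; the helpers below
      * allocate and free that vector as a unit.
      */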
2098 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2099 {
2100         int i;
2101
2102         for (i = 0; i < len; i++) {
2103                 if (likely(pg_vec[i]))
2104                         free_pages((unsigned long) pg_vec[i], order);
2105         }
2106         kfree(pg_vec);
2107 }
2108
2109 static inline char *alloc_one_pg_vec_page(unsigned long order)
2110 {
2111         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2112
2113         return (char *) __get_free_pages(gfp_flags, order);
2114 }
2115
2116 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2117 {
2118         unsigned int block_nr = req->tp_block_nr;
2119         char **pg_vec;
2120         int i;
2121
             /* kcalloc checks block_nr * sizeof(char *) for overflow */
2122         pg_vec = kcalloc(block_nr, sizeof(char *), GFP_KERNEL);
2123         if (unlikely(!pg_vec))
2124                 goto out;
2125
2126         for (i = 0; i < block_nr; i++) {
2127                 pg_vec[i] = alloc_one_pg_vec_page(order);
2128                 if (unlikely(!pg_vec[i]))
2129                         goto out_free_pgvec;
2130         }
2131
2132 out:
2133         return pg_vec;
2134
2135 out_free_pgvec:
2136         free_pg_vec(pg_vec, order, block_nr);
2137         pg_vec = NULL;
2138         goto out;
2139 }
2140
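     /*
      * Install or tear down an rx/tx ring: sanity-check the geometry,
      * allocate the new block vector, briefly unhook the protocol
      * handler, swap vectors under the queue lock, then rehook and free
      * the old vector.  A request with tp_block_nr == 0 and
      * tp_frame_nr == 0 tears the ring down.  A minimal user-space sketch
      * (assuming the default TPACKET_V1 and a bound socket fd; two
      * 2048-byte frames fit each 4096-byte block, hence 128 frames):
      *
      *	struct tpacket_req req = {
      *		.tp_block_size = 4096,
      *		.tp_block_nr   = 64,
      *		.tp_frame_size = 2048,
      *		.tp_frame_nr   = 128,
      *	};
      *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
      *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
      *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      */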
2141 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2142                 int closing, int tx_ring)
2143 {
2144         char **pg_vec = NULL;
2145         struct packet_sock *po = pkt_sk(sk);
2146         int was_running, order = 0;
2147         struct packet_ring_buffer *rb;
2148         struct sk_buff_head *rb_queue;
2149         __be16 num;
2150         int err;
2151
2152         rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2153         rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2154
2155         err = -EBUSY;
2156         if (!closing) {
2157                 if (atomic_read(&po->mapped))
2158                         goto out;
2159                 if (atomic_read(&rb->pending))
2160                         goto out;
2161         }
2162
2163         if (req->tp_block_nr) {
2164                 /* Sanity tests and some calculations */
2165                 err = -EBUSY;
2166                 if (unlikely(rb->pg_vec))
2167                         goto out;
2168
2169                 switch (po->tp_version) {
2170                 case TPACKET_V1:
2171                         po->tp_hdrlen = TPACKET_HDRLEN;
2172                         break;
2173                 case TPACKET_V2:
2174                         po->tp_hdrlen = TPACKET2_HDRLEN;
2175                         break;
2176                 }
2177
2178                 err = -EINVAL;
2179                 if (unlikely((int)req->tp_block_size <= 0))
2180                         goto out;
2181                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2182                         goto out;
2183                 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2184                                         po->tp_reserve))
2185                         goto out;
2186                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2187                         goto out;
2188
2189                 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2190                 if (unlikely(rb->frames_per_block == 0))
2191                         goto out;
2192                 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2193                                         req->tp_frame_nr))
2194                         goto out;
2195
2196                 err = -ENOMEM;
2197                 order = get_order(req->tp_block_size);
2198                 pg_vec = alloc_pg_vec(req, order);
2199                 if (unlikely(!pg_vec))
2200                         goto out;
2201         } else {
                     /* tearing down: a request without blocks must not
                      * claim any frames either
                      */
2204                 err = -EINVAL;
2205                 if (unlikely(req->tp_frame_nr))
2206                         goto out;
2207         }
2208
2209         lock_sock(sk);
2210
2211         /* Detach socket from network */
2212         spin_lock(&po->bind_lock);
2213         was_running = po->running;
2214         num = po->num;
2215         if (was_running) {
2216                 __dev_remove_pack(&po->prot_hook);
2217                 po->num = 0;
2218                 po->running = 0;
2219                 __sock_put(sk);
2220         }
2221         spin_unlock(&po->bind_lock);
2222
2223         synchronize_net();
2224
2225         err = -EBUSY;
2226         mutex_lock(&po->pg_vec_lock);
2227         if (closing || atomic_read(&po->mapped) == 0) {
2228                 err = 0;
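                     /*
                      * XC() swaps two lvalues and evaluates to the old
                      * value, so the ring's previous pg_vec/order/len land
                      * in our locals and the old vector is freed below,
                      * outside the locks.
                      */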
2229 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2230                 spin_lock_bh(&rb_queue->lock);
2231                 pg_vec = XC(rb->pg_vec, pg_vec);
2232                 rb->frame_max = (req->tp_frame_nr - 1);
2233                 rb->head = 0;
2234                 rb->frame_size = req->tp_frame_size;
2235                 spin_unlock_bh(&rb_queue->lock);
2236
2237                 order = XC(rb->pg_vec_order, order);
2238                 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2239
2240                 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2241                 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2242                                                 tpacket_rcv : packet_rcv;
2243                 skb_queue_purge(rb_queue);
2244 #undef XC
2245                 if (atomic_read(&po->mapped))
2246                         pr_err("packet_mmap: vma is busy: %d\n",
2247                                atomic_read(&po->mapped));
2248         }
2249         mutex_unlock(&po->pg_vec_lock);
2250
2251         spin_lock(&po->bind_lock);
2252         if (was_running && !po->running) {
2253                 sock_hold(sk);
2254                 po->running = 1;
2255                 po->num = num;
2256                 dev_add_pack(&po->prot_hook);
2257         }
2258         spin_unlock(&po->bind_lock);
2259
2260         release_sock(sk);
2261
2262         if (pg_vec)
2263                 free_pg_vec(pg_vec, order, req->tp_block_nr);
2264 out:
2265         return err;
2266 }
2267
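     /*
      * mmap() maps the rx ring followed by the tx ring as one contiguous
      * area; vm_pgoff must be zero and the requested length must match
      * the combined ring size exactly.  The rb <= &po->tx_ring pointer
      * walk relies on rx_ring and tx_ring being adjacent members of
      * struct packet_sock.
      */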
2268 static int packet_mmap(struct file *file, struct socket *sock,
2269                 struct vm_area_struct *vma)
2270 {
2271         struct sock *sk = sock->sk;
2272         struct packet_sock *po = pkt_sk(sk);
2273         unsigned long size, expected_size;
2274         struct packet_ring_buffer *rb;
2275         unsigned long start;
2276         int err = -EINVAL;
2277         int i;
2278
2279         if (vma->vm_pgoff)
2280                 return -EINVAL;
2281
2282         mutex_lock(&po->pg_vec_lock);
2283
2284         expected_size = 0;
2285         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2286                 if (rb->pg_vec) {
2287                         expected_size += rb->pg_vec_len
2288                                                 * rb->pg_vec_pages
2289                                                 * PAGE_SIZE;
2290                 }
2291         }
2292
2293         if (expected_size == 0)
2294                 goto out;
2295
2296         size = vma->vm_end - vma->vm_start;
2297         if (size != expected_size)
2298                 goto out;
2299
2300         start = vma->vm_start;
2301         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2302                 if (rb->pg_vec == NULL)
2303                         continue;
2304
2305                 for (i = 0; i < rb->pg_vec_len; i++) {
2306                         struct page *page = virt_to_page(rb->pg_vec[i]);
2307                         int pg_num;
2308
2309                         for (pg_num = 0; pg_num < rb->pg_vec_pages;
2310                                         pg_num++, page++) {
2311                                 err = vm_insert_page(vma, start, page);
2312                                 if (unlikely(err))
2313                                         goto out;
2314                                 start += PAGE_SIZE;
2315                         }
2316                 }
2317         }
2318
2319         atomic_inc(&po->mapped);
2320         vma->vm_ops = &packet_mmap_ops;
2321         err = 0;
2322
2323 out:
2324         mutex_unlock(&po->pg_vec_lock);
2325         return err;
2326 }
2327 #endif
2328
2329
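     /*
      * Two op tables: packet_ops_spkt serves the legacy SOCK_PACKET
      * interface (no socket options, no ring mmap), packet_ops the modern
      * AF_PACKET sockets.
      */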
2330 static const struct proto_ops packet_ops_spkt = {
2331         .family =       PF_PACKET,
2332         .owner =        THIS_MODULE,
2333         .release =      packet_release,
2334         .bind =         packet_bind_spkt,
2335         .connect =      sock_no_connect,
2336         .socketpair =   sock_no_socketpair,
2337         .accept =       sock_no_accept,
2338         .getname =      packet_getname_spkt,
2339         .poll =         datagram_poll,
2340         .ioctl =        packet_ioctl,
2341         .listen =       sock_no_listen,
2342         .shutdown =     sock_no_shutdown,
2343         .setsockopt =   sock_no_setsockopt,
2344         .getsockopt =   sock_no_getsockopt,
2345         .sendmsg =      packet_sendmsg_spkt,
2346         .recvmsg =      packet_recvmsg,
2347         .mmap =         sock_no_mmap,
2348         .sendpage =     sock_no_sendpage,
2349 };
2350
2351 static const struct proto_ops packet_ops = {
2352         .family =       PF_PACKET,
2353         .owner =        THIS_MODULE,
2354         .release =      packet_release,
2355         .bind =         packet_bind,
2356         .connect =      sock_no_connect,
2357         .socketpair =   sock_no_socketpair,
2358         .accept =       sock_no_accept,
2359         .getname =      packet_getname,
2360         .poll =         packet_poll,
2361         .ioctl =        packet_ioctl,
2362         .listen =       sock_no_listen,
2363         .shutdown =     sock_no_shutdown,
2364         .setsockopt =   packet_setsockopt,
2365         .getsockopt =   packet_getsockopt,
2366         .sendmsg =      packet_sendmsg,
2367         .recvmsg =      packet_recvmsg,
2368         .mmap =         packet_mmap,
2369         .sendpage =     sock_no_sendpage,
2370 };
2371
2372 static struct net_proto_family packet_family_ops = {
2373         .family =       PF_PACKET,
2374         .create =       packet_create,
2375         .owner  =       THIS_MODULE,
2376 };
2377
2378 static struct notifier_block packet_netdev_notifier = {
2379         .notifier_call =        packet_notifier,
2380 };
2381
2382 #ifdef CONFIG_PROC_FS
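     /* /proc/net/packet: one line per packet socket, walked under sklist_lock. */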
2383 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2384 {
2385         struct sock *s;
2386         struct hlist_node *node;
2387
2388         sk_for_each(s, node, &net->packet.sklist) {
2389                 if (!off--)
2390                         return s;
2391         }
2392         return NULL;
2393 }
2394
2395 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2396         __acquires(seq_file_net(seq)->packet.sklist_lock)
2397 {
2398         struct net *net = seq_file_net(seq);
2399         read_lock(&net->packet.sklist_lock);
2400         return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2401 }
2402
2403 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2404 {
2405         struct net *net = seq_file_net(seq);
2406         ++*pos;
2407         return (v == SEQ_START_TOKEN)
2408                 ? sk_head(&net->packet.sklist)
2409                 : sk_next((struct sock *)v);
2410 }
2411
2412 static void packet_seq_stop(struct seq_file *seq, void *v)
2413         __releases(seq_file_net(seq)->packet.sklist_lock)
2414 {
2415         struct net *net = seq_file_net(seq);
2416         read_unlock(&net->packet.sklist_lock);
2417 }
2418
2419 static int packet_seq_show(struct seq_file *seq, void *v)
2420 {
2421         if (v == SEQ_START_TOKEN)
2422                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2423         else {
2424                 struct sock *s = v;
2425                 const struct packet_sock *po = pkt_sk(s);
2426
2427                 seq_printf(seq,
2428                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2429                            s,
2430                            atomic_read(&s->sk_refcnt),
2431                            s->sk_type,
2432                            ntohs(po->num),
2433                            po->ifindex,
2434                            po->running,
2435                            atomic_read(&s->sk_rmem_alloc),
2436                            sock_i_uid(s),
2437                            sock_i_ino(s));
2438         }
2439
2440         return 0;
2441 }
2442
2443 static const struct seq_operations packet_seq_ops = {
2444         .start  = packet_seq_start,
2445         .next   = packet_seq_next,
2446         .stop   = packet_seq_stop,
2447         .show   = packet_seq_show,
2448 };
2449
2450 static int packet_seq_open(struct inode *inode, struct file *file)
2451 {
2452         return seq_open_net(inode, file, &packet_seq_ops,
2453                             sizeof(struct seq_net_private));
2454 }
2455
2456 static const struct file_operations packet_seq_fops = {
2457         .owner          = THIS_MODULE,
2458         .open           = packet_seq_open,
2459         .read           = seq_read,
2460         .llseek         = seq_lseek,
2461         .release        = seq_release_net,
2462 };
2463
2464 #endif
2465
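     /*
      * Per-namespace state: each netns gets its own socket list and its
      * own /proc/net/packet entry.
      */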
2466 static int packet_net_init(struct net *net)
2467 {
2468         rwlock_init(&net->packet.sklist_lock);
2469         INIT_HLIST_HEAD(&net->packet.sklist);
2470
2471         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2472                 return -ENOMEM;
2473
2474         return 0;
2475 }
2476
2477 static void packet_net_exit(struct net *net)
2478 {
2479         proc_net_remove(net, "packet");
2480 }
2481
2482 static struct pernet_operations packet_net_ops = {
2483         .init = packet_net_init,
2484         .exit = packet_net_exit,
2485 };
2486
2487
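     /* Module teardown mirrors packet_init() in reverse order. */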
2488 static void __exit packet_exit(void)
2489 {
2490         unregister_netdevice_notifier(&packet_netdev_notifier);
2491         unregister_pernet_subsys(&packet_net_ops);
2492         sock_unregister(PF_PACKET);
2493         proto_unregister(&packet_proto);
2494 }
2495
2496 static int __init packet_init(void)
2497 {
2498         int rc = proto_register(&packet_proto, 0);
2499
2500         if (rc != 0)
2501                 goto out;
2502
             /* check each registration and unwind in reverse order on failure */
             rc = sock_register(&packet_family_ops);
             if (rc)
                     goto out_proto;
             rc = register_pernet_subsys(&packet_net_ops);
             if (rc)
                     goto out_sock;
             rc = register_netdevice_notifier(&packet_netdev_notifier);
             if (rc)
                     goto out_pernet;
             return 0;

     out_pernet:
             unregister_pernet_subsys(&packet_net_ops);
     out_sock:
             sock_unregister(PF_PACKET);
     out_proto:
             proto_unregister(&packet_proto);
2506 out:
2507         return rc;
2508 }
2509
2510 module_init(packet_init);
2511 module_exit(packet_exit);
2512 MODULE_LICENSE("GPL");
2513 MODULE_ALIAS_NETPROTO(PF_PACKET);