2 * NET3 Protocol independent device support routines.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Derived from the non IP parts of dev.c 1.0.19
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
51 * Rudi Cilibrasi : Pass the right thing to
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/proc_fs.h>
101 #include <linux/seq_file.h>
102 #include <linux/stat.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/module.h>
110 #include <linux/netpoll.h>
111 #include <linux/rcupdate.h>
112 #include <linux/delay.h>
113 #include <net/wext.h>
114 #include <net/iw_handler.h>
115 #include <asm/current.h>
116 #include <linux/audit.h>
117 #include <linux/dmaengine.h>
118 #include <linux/err.h>
119 #include <linux/ctype.h>
120 #include <linux/if_arp.h>
121 #include <linux/if_vlan.h>
122 #include <linux/ip.h>
124 #include <linux/ipv6.h>
125 #include <linux/in.h>
126 #include <linux/jhash.h>
127 #include <linux/random.h>
128 #include <trace/events/napi.h>
129 #include <trace/events/net.h>
130 #include <trace/events/skb.h>
131 #include <linux/pci.h>
132 #include <linux/inetdevice.h>
133 #include <linux/cpu_rmap.h>
134 #include <linux/static_key.h>
136 #include "net-sysfs.h"
138 /* Instead of increasing this, you should create a hash table. */
139 #define MAX_GRO_SKBS 8
141 /* This should be increased if a protocol with a bigger head is added. */
142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
145 * The list of packet types we will receive (as opposed to discard)
146 * and the routines to invoke.
148 * Why 16? Because with 16 the only overlap we get on a hash of the
149 * low nibble of the protocol value is RARP/SNAP/X.25.
151 * NOTE: That is no longer true with the addition of VLAN tags. Not
152 * sure which should go first, but I bet it won't make much
153 * difference if we are running VLANs. The good news is that
154 * this protocol won't be in the list unless compiled in, so
155 * the average user (w/out VLANs) will not be adversely affected.
172 #define PTYPE_HASH_SIZE (16)
173 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
175 static DEFINE_SPINLOCK(ptype_lock);
176 static DEFINE_SPINLOCK(offload_lock);
177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
178 static struct list_head ptype_all __read_mostly; /* Taps */
179 static struct list_head offload_base __read_mostly;
182 * The @dev_base_head list is protected by @dev_base_lock and the rtnl semaphore.
185 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
187 * Writers must hold the rtnl semaphore while they loop through the
188 * dev_base_head list, and hold dev_base_lock for writing when they do the
189 * actual updates. This allows pure readers to access the list even
190 * while a writer is preparing to update it.
192 * To put it another way, dev_base_lock is held for writing only to
193 * protect against pure readers; the rtnl semaphore provides the
194 * protection against other writers.
196 * See, for example usages, register_netdevice() and
197 * unregister_netdevice(), which must be called with the rtnl semaphore held.
200 DEFINE_RWLOCK(dev_base_lock);
201 EXPORT_SYMBOL(dev_base_lock);
203 seqcount_t devnet_rename_seq;
205 static inline void dev_base_seq_inc(struct net *net)
207 while (++net->dev_base_seq == 0);
210 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
212 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
214 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
217 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
219 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
222 static inline void rps_lock(struct softnet_data *sd)
225 spin_lock(&sd->input_pkt_queue.lock);
229 static inline void rps_unlock(struct softnet_data *sd)
232 spin_unlock(&sd->input_pkt_queue.lock);
236 /* Device list insertion */
237 static int list_netdevice(struct net_device *dev)
239 struct net *net = dev_net(dev);
243 write_lock_bh(&dev_base_lock);
244 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
245 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
246 hlist_add_head_rcu(&dev->index_hlist,
247 dev_index_hash(net, dev->ifindex));
248 write_unlock_bh(&dev_base_lock);
250 dev_base_seq_inc(net);
255 /* Device list removal
256 * caller must respect an RCU grace period before freeing/reusing dev
258 static void unlist_netdevice(struct net_device *dev)
262 /* Unlink dev from the device chain */
263 write_lock_bh(&dev_base_lock);
264 list_del_rcu(&dev->dev_list);
265 hlist_del_rcu(&dev->name_hlist);
266 hlist_del_rcu(&dev->index_hlist);
267 write_unlock_bh(&dev_base_lock);
269 dev_base_seq_inc(dev_net(dev));
276 static RAW_NOTIFIER_HEAD(netdev_chain);
279 * Device drivers call our routines to queue packets here. We empty the
280 * queue in the local softnet handler.
283 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
284 EXPORT_PER_CPU_SYMBOL(softnet_data);
286 #ifdef CONFIG_LOCKDEP
288 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
289 * according to dev->type
291 static const unsigned short netdev_lock_type[] =
292 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
293 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
294 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
295 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
296 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
297 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
298 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
299 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
300 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
301 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
302 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
303 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
304 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
305 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
306 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
308 static const char *const netdev_lock_name[] =
309 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
310 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
311 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
312 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
313 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
314 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
315 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
316 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
317 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
318 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
319 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
320 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
321 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
322 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
323 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
325 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
326 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
328 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
332 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
333 if (netdev_lock_type[i] == dev_type)
335 /* the last key is used by default */
336 return ARRAY_SIZE(netdev_lock_type) - 1;
339 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
340 unsigned short dev_type)
344 i = netdev_lock_pos(dev_type);
345 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
346 netdev_lock_name[i]);
349 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
353 i = netdev_lock_pos(dev->type);
354 lockdep_set_class_and_name(&dev->addr_list_lock,
355 &netdev_addr_lock_key[i],
356 netdev_lock_name[i]);
359 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
360 unsigned short dev_type)
363 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
368 /*******************************************************************************
370 Protocol management and registration routines
372 *******************************************************************************/
375 * Add a protocol ID to the list. Now that the input handler is
376 * smarter we can dispense with all the messy stuff that used to be here.
379 * BEWARE!!! Protocol handlers, mangling input packets,
380 * MUST BE last in hash buckets and checking protocol handlers
381 * MUST start from promiscuous ptype_all chain in net_bh.
382 * It is true now, do not change it.
383 * Explanation follows: if a protocol handler that mangles packets
384 * were first in the list, it could not sense that the packet is
385 * cloned and should be copied-on-write, so it would modify the clone
386 * and subsequent readers would get a broken packet.
390 static inline struct list_head *ptype_head(const struct packet_type *pt)
392 if (pt->type == htons(ETH_P_ALL))
395 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
399 * dev_add_pack - add packet handler
400 * @pt: packet type declaration
402 * Add a protocol handler to the networking stack. The passed &packet_type
403 * is linked into kernel lists and may not be freed until it has been
404 * removed from the kernel lists.
406 * This call does not sleep, therefore it cannot
407 * guarantee that all CPUs that are in the middle of receiving packets
408 * will see the new packet type (until the next received packet).
411 void dev_add_pack(struct packet_type *pt)
413 struct list_head *head = ptype_head(pt);
415 spin_lock(&ptype_lock);
416 list_add_rcu(&pt->list, head);
417 spin_unlock(&ptype_lock);
419 EXPORT_SYMBOL(dev_add_pack);
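/* Example (illustrative sketch): a module that taps incoming IPv4 frames by
 * registering a packet handler. The names my_ipv4_rcv and my_ipv4_pt are
 * hypothetical; the handler owns the reference it is given and must consume
 * the skb.
 *
 *	static int my_ipv4_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		pr_debug("got %u bytes on %s\n", skb->len, dev->name);
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ipv4_pt __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_IP),
 *		.func	= my_ipv4_rcv,
 *	};
 *
 *	dev_add_pack(&my_ipv4_pt);	// module init
 *	dev_remove_pack(&my_ipv4_pt);	// module exit
 */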
422 * __dev_remove_pack - remove packet handler
423 * @pt: packet type declaration
425 * Remove a protocol handler that was previously added to the kernel
426 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
427 * from the kernel lists and can be freed or reused once this function returns.
430 * The packet type might still be in use by receivers
431 * and must not be freed until after all the CPUs have gone
432 * through a quiescent state.
434 void __dev_remove_pack(struct packet_type *pt)
436 struct list_head *head = ptype_head(pt);
437 struct packet_type *pt1;
439 spin_lock(&ptype_lock);
441 list_for_each_entry(pt1, head, list) {
443 list_del_rcu(&pt->list);
448 pr_warn("dev_remove_pack: %p not found\n", pt);
450 spin_unlock(&ptype_lock);
452 EXPORT_SYMBOL(__dev_remove_pack);
455 * dev_remove_pack - remove packet handler
456 * @pt: packet type declaration
458 * Remove a protocol handler that was previously added to the kernel
459 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
460 * from the kernel lists and can be freed or reused once this function returns.
463 * This call sleeps to guarantee that no CPU is looking at the packet type after return.
466 void dev_remove_pack(struct packet_type *pt)
468 __dev_remove_pack(pt);
472 EXPORT_SYMBOL(dev_remove_pack);
476 * dev_add_offload - register offload handlers
477 * @po: protocol offload declaration
479 * Add protocol offload handlers to the networking stack. The passed
480 * &proto_offload is linked into kernel lists and may not be freed until
481 * it has been removed from the kernel lists.
483 * This call does not sleep, therefore it cannot
484 * guarantee that all CPUs that are in the middle of receiving packets
485 * will see the new offload handlers (until the next received packet).
487 void dev_add_offload(struct packet_offload *po)
489 struct list_head *head = &offload_base;
491 spin_lock(&offload_lock);
492 list_add_rcu(&po->list, head);
493 spin_unlock(&offload_lock);
495 EXPORT_SYMBOL(dev_add_offload);
498 * __dev_remove_offload - remove offload handler
499 * @po: packet offload declaration
501 * Remove a protocol offload handler that was previously added to the
502 * kernel offload handlers by dev_add_offload(). The passed &offload_type
503 * is removed from the kernel lists and can be freed or reused once this function returns.
506 * The packet type might still be in use by receivers
507 * and must not be freed until after all the CPUs have gone
508 * through a quiescent state.
510 void __dev_remove_offload(struct packet_offload *po)
512 struct list_head *head = &offload_base;
513 struct packet_offload *po1;
515 spin_lock(&offload_lock);
517 list_for_each_entry(po1, head, list) {
519 list_del_rcu(&po->list);
524 pr_warn("dev_remove_offload: %p not found\n", po);
526 spin_unlock(&offload_lock);
528 EXPORT_SYMBOL(__dev_remove_offload);
531 * dev_remove_offload - remove packet offload handler
532 * @po: packet offload declaration
534 * Remove a packet offload handler that was previously added to the kernel
535 * offload handlers by dev_add_offload(). The passed &offload_type is
536 * removed from the kernel lists and can be freed or reused once this function returns.
539 * This call sleeps to guarantee that no CPU is looking at the packet type after return.
542 void dev_remove_offload(struct packet_offload *po)
544 __dev_remove_offload(po);
548 EXPORT_SYMBOL(dev_remove_offload);
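/* Example (illustrative sketch): how a protocol supplies its GSO/GRO
 * callbacks, mirroring the pattern IPv4 uses in net/ipv4/af_inet.c. The
 * my_* callback names are hypothetical placeholders.
 *
 *	static struct packet_offload my_ip_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.callbacks = {
 *			.gso_send_check	= my_gso_send_check,
 *			.gso_segment	= my_gso_segment,
 *			.gro_receive	= my_gro_receive,
 *			.gro_complete	= my_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&my_ip_offload);	// protocol init
 *	dev_remove_offload(&my_ip_offload);	// teardown
 */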
550 /******************************************************************************
552 Device Boot-time Settings Routines
554 *******************************************************************************/
556 /* Boot time configuration table */
557 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
560 * netdev_boot_setup_add - add new setup entry
561 * @name: name of the device
562 * @map: configured settings for the device
564 * Adds new setup entry to the dev_boot_setup list. The function
565 * returns 0 on error and 1 on success. This is a generic routine for all netdevices.
568 static int netdev_boot_setup_add(char *name, struct ifmap *map)
570 struct netdev_boot_setup *s;
574 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
575 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
576 memset(s[i].name, 0, sizeof(s[i].name));
577 strlcpy(s[i].name, name, IFNAMSIZ);
578 memcpy(&s[i].map, map, sizeof(s[i].map));
583 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
587 * netdev_boot_setup_check - check boot time settings
588 * @dev: the netdevice
590 * Check boot time settings for the device.
591 * The found settings are set for the device to be used
592 * later in the device probing.
593 * Returns 0 if no settings found, 1 if they are.
595 int netdev_boot_setup_check(struct net_device *dev)
597 struct netdev_boot_setup *s = dev_boot_setup;
600 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
601 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
602 !strcmp(dev->name, s[i].name)) {
603 dev->irq = s[i].map.irq;
604 dev->base_addr = s[i].map.base_addr;
605 dev->mem_start = s[i].map.mem_start;
606 dev->mem_end = s[i].map.mem_end;
612 EXPORT_SYMBOL(netdev_boot_setup_check);
616 * netdev_boot_base - get address from boot time settings
617 * @prefix: prefix for network device
618 * @unit: id for network device
620 * Check boot time settings for the base address of device.
621 * The found settings are set for the device to be used
622 * later in the device probing.
623 * Returns 0 if no settings found.
625 unsigned long netdev_boot_base(const char *prefix, int unit)
627 const struct netdev_boot_setup *s = dev_boot_setup;
631 sprintf(name, "%s%d", prefix, unit);
634 * If device already registered then return base of 1
635 * to indicate not to probe for this interface
637 if (__dev_get_by_name(&init_net, name))
640 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
641 if (!strcmp(name, s[i].name))
642 return s[i].map.base_addr;
647 * Saves at boot time configured settings for any netdevice.
649 int __init netdev_boot_setup(char *str)
654 str = get_options(str, ARRAY_SIZE(ints), ints);
659 memset(&map, 0, sizeof(map));
663 map.base_addr = ints[2];
665 map.mem_start = ints[3];
667 map.mem_end = ints[4];
669 /* Add new entry to the list */
670 return netdev_boot_setup_add(str, &map);
673 __setup("netdev=", netdev_boot_setup);
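/* Example (illustrative): the boot parameter parsed above takes up to four
 * integers followed by a name, i.e. netdev=<irq>,<io>,<mem_start>,<mem_end>,<name>.
 * The concrete values below are arbitrary:
 *
 *	netdev=9,0x300,0,0,eth1
 *
 * requests IRQ 9 and I/O base 0x300 for the device that will probe as eth1.
 */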
675 /*******************************************************************************
677 Device Interface Subroutines
679 *******************************************************************************/
682 * __dev_get_by_name - find a device by its name
683 * @net: the applicable net namespace
684 * @name: name to find
686 * Find an interface by name. Must be called under RTNL semaphore
687 * or @dev_base_lock. If the name is found a pointer to the device
688 * is returned. If the name is not found then %NULL is returned. The
689 * reference counters are not incremented so the caller must be
690 * careful with locks.
693 struct net_device *__dev_get_by_name(struct net *net, const char *name)
695 struct hlist_node *p;
696 struct net_device *dev;
697 struct hlist_head *head = dev_name_hash(net, name);
699 hlist_for_each_entry(dev, p, head, name_hlist)
700 if (!strncmp(dev->name, name, IFNAMSIZ))
705 EXPORT_SYMBOL(__dev_get_by_name);
708 * dev_get_by_name_rcu - find a device by its name
709 * @net: the applicable net namespace
710 * @name: name to find
712 * Find an interface by name.
713 * If the name is found a pointer to the device is returned.
714 * If the name is not found then %NULL is returned.
715 * The reference counters are not incremented so the caller must be
716 * careful with locks. The caller must hold RCU lock.
719 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
721 struct hlist_node *p;
722 struct net_device *dev;
723 struct hlist_head *head = dev_name_hash(net, name);
725 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
726 if (!strncmp(dev->name, name, IFNAMSIZ))
731 EXPORT_SYMBOL(dev_get_by_name_rcu);
734 * dev_get_by_name - find a device by its name
735 * @net: the applicable net namespace
736 * @name: name to find
738 * Find an interface by name. This can be called from any
739 * context and does its own locking. The returned handle has
740 * the usage count incremented and the caller must use dev_put() to
741 * release it when it is no longer needed. %NULL is returned if no
742 * matching device is found.
745 struct net_device *dev_get_by_name(struct net *net, const char *name)
747 struct net_device *dev;
750 dev = dev_get_by_name_rcu(net, name);
756 EXPORT_SYMBOL(dev_get_by_name);
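/* Example (illustrative sketch): typical lookup from process context. The
 * interface name "eth0" is arbitrary.
 *
 *	struct net_device *dev;
 *
 *	dev = dev_get_by_name(&init_net, "eth0");
 *	if (dev) {
 *		pr_info("%s has ifindex %d\n", dev->name, dev->ifindex);
 *		dev_put(dev);		// drop the reference taken by the lookup
 *	}
 */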
759 * __dev_get_by_index - find a device by its ifindex
760 * @net: the applicable net namespace
761 * @ifindex: index of device
763 * Search for an interface by index. Returns a pointer to the device,
764 * or %NULL if the device is not found. The device has not
765 * had its reference counter increased so the caller must be careful
766 * about locking. The caller must hold either the RTNL semaphore or @dev_base_lock.
770 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
772 struct hlist_node *p;
773 struct net_device *dev;
774 struct hlist_head *head = dev_index_hash(net, ifindex);
776 hlist_for_each_entry(dev, p, head, index_hlist)
777 if (dev->ifindex == ifindex)
782 EXPORT_SYMBOL(__dev_get_by_index);
785 * dev_get_by_index_rcu - find a device by its ifindex
786 * @net: the applicable net namespace
787 * @ifindex: index of device
789 * Search for an interface by index. Returns a pointer to the device,
790 * or %NULL if the device is not found. The device has not
791 * had its reference counter increased so the caller must be careful
792 * about locking. The caller must hold RCU lock.
795 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
797 struct hlist_node *p;
798 struct net_device *dev;
799 struct hlist_head *head = dev_index_hash(net, ifindex);
801 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
802 if (dev->ifindex == ifindex)
807 EXPORT_SYMBOL(dev_get_by_index_rcu);
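/* Example (illustrative sketch): the _rcu lookup must run inside an RCU
 * read-side critical section, and the result may only be used there unless
 * a reference is taken with dev_hold().
 *
 *	struct net_device *dev;
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_index_rcu(net, ifindex);
 *	if (dev)
 *		pr_info("ifindex %d is %s\n", ifindex, dev->name);
 *	rcu_read_unlock();
 */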
811 * dev_get_by_index - find a device by its ifindex
812 * @net: the applicable net namespace
813 * @ifindex: index of device
815 * Search for an interface by index. Returns a pointer to the device,
816 * or %NULL if the device is not found. The device returned has
817 * had a reference added and the pointer is safe until the user calls
818 * dev_put to indicate they have finished with it.
821 struct net_device *dev_get_by_index(struct net *net, int ifindex)
823 struct net_device *dev;
826 dev = dev_get_by_index_rcu(net, ifindex);
832 EXPORT_SYMBOL(dev_get_by_index);
835 * dev_getbyhwaddr_rcu - find a device by its hardware address
836 * @net: the applicable net namespace
837 * @type: media type of device
838 * @ha: hardware address
840 * Search for an interface by MAC address. Returns a pointer to the
841 * device, or %NULL if the device is not found.
842 * The caller must hold RCU or RTNL.
843 * The returned device has not had its ref count increased
844 * and the caller must therefore be careful about locking
848 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
851 struct net_device *dev;
853 for_each_netdev_rcu(net, dev)
854 if (dev->type == type &&
855 !memcmp(dev->dev_addr, ha, dev->addr_len))
860 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
862 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
864 struct net_device *dev;
867 for_each_netdev(net, dev)
868 if (dev->type == type)
873 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
875 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
877 struct net_device *dev, *ret = NULL;
880 for_each_netdev_rcu(net, dev)
881 if (dev->type == type) {
889 EXPORT_SYMBOL(dev_getfirstbyhwtype);
892 * dev_get_by_flags_rcu - find any device with given flags
893 * @net: the applicable net namespace
894 * @if_flags: IFF_* values
895 * @mask: bitmask of bits in if_flags to check
897 * Search for any interface with the given flags. Returns a pointer to the
898 * device, or %NULL if no device is found. Must be called inside
899 * rcu_read_lock(), and result refcount is unchanged.
902 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
905 struct net_device *dev, *ret;
908 for_each_netdev_rcu(net, dev) {
909 if (((dev->flags ^ if_flags) & mask) == 0) {
916 EXPORT_SYMBOL(dev_get_by_flags_rcu);
919 * dev_valid_name - check if name is okay for network device
922 * Network device names need to be valid file names
923 * to allow sysfs to work. We also disallow any kind of whitespace.
926 bool dev_valid_name(const char *name)
930 if (strlen(name) >= IFNAMSIZ)
932 if (!strcmp(name, ".") || !strcmp(name, ".."))
936 if (*name == '/' || isspace(*name))
942 EXPORT_SYMBOL(dev_valid_name);
945 * __dev_alloc_name - allocate a name for a device
946 * @net: network namespace to allocate the device name in
947 * @name: name format string
948 * @buf: scratch buffer and result name string
950 * Passed a format string - eg "lt%d" it will try and find a suitable
951 * id. It scans list of devices to build up a free map, then chooses
952 * the first empty slot. The caller must hold the dev_base or rtnl lock
953 * while allocating the name and adding the device in order to avoid duplicates.
955 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
956 * Returns the number of the unit assigned or a negative errno code.
959 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
963 const int max_netdevices = 8*PAGE_SIZE;
964 unsigned long *inuse;
965 struct net_device *d;
967 p = strnchr(name, IFNAMSIZ-1, '%');
970 * Verify the string as this thing may have come from
971 * the user. There must be either one "%d" and no other "%"
974 if (p[1] != 'd' || strchr(p + 2, '%'))
977 /* Use one page as a bit array of possible slots */
978 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
982 for_each_netdev(net, d) {
983 if (!sscanf(d->name, name, &i))
985 if (i < 0 || i >= max_netdevices)
988 /* avoid cases where sscanf is not exact inverse of printf */
989 snprintf(buf, IFNAMSIZ, name, i);
990 if (!strncmp(buf, d->name, IFNAMSIZ))
994 i = find_first_zero_bit(inuse, max_netdevices);
995 free_page((unsigned long) inuse);
999 snprintf(buf, IFNAMSIZ, name, i);
1000 if (!__dev_get_by_name(net, buf))
1003 /* It is possible to run out of possible slots
1004 * when the name is long and there isn't enough space left
1005 * for the digits, or if all bits are used.
1011 * dev_alloc_name - allocate a name for a device
1013 * @name: name format string
1015 * Passed a format string - eg "lt%d" it will try and find a suitable
1016 * id. It scans list of devices to build up a free map, then chooses
1017 * the first empty slot. The caller must hold the dev_base or rtnl lock
1018 * while allocating the name and adding the device in order to avoid duplicates.
1020 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021 * Returns the number of the unit assigned or a negative errno code.
1024 int dev_alloc_name(struct net_device *dev, const char *name)
1030 BUG_ON(!dev_net(dev));
1032 ret = __dev_alloc_name(net, name, buf);
1034 strlcpy(dev->name, buf, IFNAMSIZ);
1037 EXPORT_SYMBOL(dev_alloc_name);
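/* Example (illustrative sketch): a driver letting the core pick the unit
 * number before registration; "mydev%d" is a hypothetical format string.
 *
 *	err = dev_alloc_name(dev, "mydev%d");
 *	if (err < 0)
 *		goto fail;
 *	// dev->name is now e.g. "mydev0", err holds the unit number
 *	err = register_netdev(dev);
 */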
1039 static int dev_alloc_name_ns(struct net *net,
1040 struct net_device *dev,
1046 ret = __dev_alloc_name(net, name, buf);
1048 strlcpy(dev->name, buf, IFNAMSIZ);
1052 static int dev_get_valid_name(struct net *net,
1053 struct net_device *dev,
1058 if (!dev_valid_name(name))
1061 if (strchr(name, '%'))
1062 return dev_alloc_name_ns(net, dev, name);
1063 else if (__dev_get_by_name(net, name))
1065 else if (dev->name != name)
1066 strlcpy(dev->name, name, IFNAMSIZ);
1072 * dev_change_name - change name of a device
1074 * @newname: name (or format string) must be at least IFNAMSIZ
1076 * Change name of a device, can pass format strings "eth%d".
1079 int dev_change_name(struct net_device *dev, const char *newname)
1081 char oldname[IFNAMSIZ];
1087 BUG_ON(!dev_net(dev));
1090 if (dev->flags & IFF_UP)
1093 write_seqcount_begin(&devnet_rename_seq);
1095 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1096 write_seqcount_end(&devnet_rename_seq);
1100 memcpy(oldname, dev->name, IFNAMSIZ);
1102 err = dev_get_valid_name(net, dev, newname);
1104 write_seqcount_end(&devnet_rename_seq);
1109 ret = device_rename(&dev->dev, dev->name);
1111 memcpy(dev->name, oldname, IFNAMSIZ);
1112 write_seqcount_end(&devnet_rename_seq);
1116 write_seqcount_end(&devnet_rename_seq);
1118 write_lock_bh(&dev_base_lock);
1119 hlist_del_rcu(&dev->name_hlist);
1120 write_unlock_bh(&dev_base_lock);
1124 write_lock_bh(&dev_base_lock);
1125 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1126 write_unlock_bh(&dev_base_lock);
1128 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1129 ret = notifier_to_errno(ret);
1132 /* err >= 0 after dev_alloc_name() or stores the first errno */
1135 write_seqcount_begin(&devnet_rename_seq);
1136 memcpy(dev->name, oldname, IFNAMSIZ);
1139 pr_err("%s: name change rollback failed: %d\n",
1148 * dev_set_alias - change ifalias of a device
1150 * @alias: name up to IFALIASZ
1151 * @len: limit of bytes to copy from info
1153 * Set ifalias for a device,
1155 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1161 if (len >= IFALIASZ)
1165 kfree(dev->ifalias);
1166 dev->ifalias = NULL;
1170 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1173 dev->ifalias = new_ifalias;
1175 strlcpy(dev->ifalias, alias, len+1);
1181 * netdev_features_change - device changes features
1182 * @dev: device to cause notification
1184 * Called to indicate a device has changed features.
1186 void netdev_features_change(struct net_device *dev)
1188 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1190 EXPORT_SYMBOL(netdev_features_change);
1193 * netdev_state_change - device changes state
1194 * @dev: device to cause notification
1196 * Called to indicate a device has changed state. This function calls
1197 * the notifier chains for netdev_chain and sends a NEWLINK message
1198 * to the routing socket.
1200 void netdev_state_change(struct net_device *dev)
1202 if (dev->flags & IFF_UP) {
1203 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1204 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1207 EXPORT_SYMBOL(netdev_state_change);
1210 * netdev_notify_peers - notify network peers about existence of @dev
1211 * @dev: network device
1213 * Generate traffic such that interested network peers are aware of
1214 * @dev, such as by generating a gratuitous ARP. This may be used when
1215 * a device wants to inform the rest of the network about some sort of
1216 * reconfiguration such as a failover event or virtual machine
1219 void netdev_notify_peers(struct net_device *dev)
1222 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1225 EXPORT_SYMBOL(netdev_notify_peers);
1227 static int __dev_open(struct net_device *dev)
1229 const struct net_device_ops *ops = dev->netdev_ops;
1234 if (!netif_device_present(dev))
1237 /* Block netpoll from trying to do any rx path servicing.
1238 * If we don't do this there is a chance ndo_poll_controller
1239 * or ndo_poll may be running while we open the device
1241 ret = netpoll_rx_disable(dev);
1245 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1246 ret = notifier_to_errno(ret);
1250 set_bit(__LINK_STATE_START, &dev->state);
1252 if (ops->ndo_validate_addr)
1253 ret = ops->ndo_validate_addr(dev);
1255 if (!ret && ops->ndo_open)
1256 ret = ops->ndo_open(dev);
1258 netpoll_rx_enable(dev);
1261 clear_bit(__LINK_STATE_START, &dev->state);
1263 dev->flags |= IFF_UP;
1264 net_dmaengine_get();
1265 dev_set_rx_mode(dev);
1267 add_device_randomness(dev->dev_addr, dev->addr_len);
1274 * dev_open - prepare an interface for use.
1275 * @dev: device to open
1277 * Takes a device from down to up state. The device's private open
1278 * function is invoked and then the multicast lists are loaded. Finally
1279 * the device is moved into the up state and a %NETDEV_UP message is
1280 * sent to the netdev notifier chain.
1282 * Calling this function on an active interface is a nop. On a failure
1283 * a negative errno code is returned.
1285 int dev_open(struct net_device *dev)
1289 if (dev->flags & IFF_UP)
1292 ret = __dev_open(dev);
1296 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1297 call_netdevice_notifiers(NETDEV_UP, dev);
1301 EXPORT_SYMBOL(dev_open);
1303 static int __dev_close_many(struct list_head *head)
1305 struct net_device *dev;
1310 list_for_each_entry(dev, head, unreg_list) {
1311 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1313 clear_bit(__LINK_STATE_START, &dev->state);
1315 /* Synchronize to scheduled poll. We cannot touch the poll list, it
1316 * can even be on a different cpu. So just clear netif_running().
1318 * dev->stop() will invoke napi_disable() on all of its
1319 * napi_struct instances on this device.
1321 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1324 dev_deactivate_many(head);
1326 list_for_each_entry(dev, head, unreg_list) {
1327 const struct net_device_ops *ops = dev->netdev_ops;
1330 * Call the device specific close. This cannot fail.
1331 * Only done if the device is UP.
1333 * We allow it to be called even after a DETACH hot-plug event.
1339 dev->flags &= ~IFF_UP;
1340 net_dmaengine_put();
1346 static int __dev_close(struct net_device *dev)
1351 /* Temporarily disable netpoll until the interface is down */
1352 retval = netpoll_rx_disable(dev);
1356 list_add(&dev->unreg_list, &single);
1357 retval = __dev_close_many(&single);
1360 netpoll_rx_enable(dev);
1364 static int dev_close_many(struct list_head *head)
1366 struct net_device *dev, *tmp;
1367 LIST_HEAD(tmp_list);
1369 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1370 if (!(dev->flags & IFF_UP))
1371 list_move(&dev->unreg_list, &tmp_list);
1373 __dev_close_many(head);
1375 list_for_each_entry(dev, head, unreg_list) {
1376 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1377 call_netdevice_notifiers(NETDEV_DOWN, dev);
1380 /* rollback_registered_many needs the complete original list */
1381 list_splice(&tmp_list, head);
1386 * dev_close - shutdown an interface.
1387 * @dev: device to shutdown
1389 * This function moves an active device into down state. A
1390 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1391 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier chain.
1394 int dev_close(struct net_device *dev)
1397 if (dev->flags & IFF_UP) {
1400 /* Block netpoll rx while the interface is going down */
1401 ret = netpoll_rx_disable(dev);
1405 list_add(&dev->unreg_list, &single);
1406 dev_close_many(&single);
1409 netpoll_rx_enable(dev);
1413 EXPORT_SYMBOL(dev_close);
1417 * dev_disable_lro - disable Large Receive Offload on a device
1420 * Disable Large Receive Offload (LRO) on a net device. Must be
1421 * called under RTNL. This is needed if received packets may be
1422 * forwarded to another interface.
1424 void dev_disable_lro(struct net_device *dev)
1427 * If we're trying to disable lro on a vlan device
1428 * use the underlying physical device instead
1430 if (is_vlan_dev(dev))
1431 dev = vlan_dev_real_dev(dev);
1433 dev->wanted_features &= ~NETIF_F_LRO;
1434 netdev_update_features(dev);
1436 if (unlikely(dev->features & NETIF_F_LRO))
1437 netdev_WARN(dev, "failed to disable LRO!\n");
1439 EXPORT_SYMBOL(dev_disable_lro);
1442 static int dev_boot_phase = 1;
1445 * register_netdevice_notifier - register a network notifier block
1448 * Register a notifier to be called when network device events occur.
1449 * The notifier passed is linked into the kernel structures and must
1450 * not be reused until it has been unregistered. A negative errno code
1451 * is returned on a failure.
1453 * When registered, all registration and up events are replayed
1454 * to the new notifier to allow the device to have a race free
1455 * view of the network device list.
1458 int register_netdevice_notifier(struct notifier_block *nb)
1460 struct net_device *dev;
1461 struct net_device *last;
1466 err = raw_notifier_chain_register(&netdev_chain, nb);
1472 for_each_netdev(net, dev) {
1473 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1474 err = notifier_to_errno(err);
1478 if (!(dev->flags & IFF_UP))
1481 nb->notifier_call(nb, NETDEV_UP, dev);
1492 for_each_netdev(net, dev) {
1496 if (dev->flags & IFF_UP) {
1497 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1498 nb->notifier_call(nb, NETDEV_DOWN, dev);
1500 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1505 raw_notifier_chain_unregister(&netdev_chain, nb);
1508 EXPORT_SYMBOL(register_netdevice_notifier);
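/* Example (illustrative sketch): a subsystem watching devices come and go.
 * At this point in time the notifier payload is the struct net_device
 * pointer itself; my_netdev_event and my_netdev_nb are hypothetical names.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			pr_info("%s is up\n", dev->name);
 *			break;
 *		case NETDEV_UNREGISTER:
 *			pr_info("%s is going away\n", dev->name);
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);	// replays REGISTER/UP
 *	unregister_netdevice_notifier(&my_netdev_nb);	// synthesizes DOWN/UNREGISTER
 */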
1511 * unregister_netdevice_notifier - unregister a network notifier block
1514 * Unregister a notifier previously registered by
1515 * register_netdevice_notifier(). The notifier is unlinked from the
1516 * kernel structures and may then be reused. A negative errno code
1517 * is returned on a failure.
1519 * After unregistering, unregister and down device events are synthesized
1520 * for all devices on the device list and sent to the removed notifier,
1521 * removing the need for special case cleanup code.
1524 int unregister_netdevice_notifier(struct notifier_block *nb)
1526 struct net_device *dev;
1531 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1536 for_each_netdev(net, dev) {
1537 if (dev->flags & IFF_UP) {
1538 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1539 nb->notifier_call(nb, NETDEV_DOWN, dev);
1541 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1548 EXPORT_SYMBOL(unregister_netdevice_notifier);
1551 * call_netdevice_notifiers - call all network notifier blocks
1552 * @val: value passed unmodified to notifier function
1553 * @dev: net_device pointer passed unmodified to notifier function
1555 * Call all network notifier blocks. Parameters and return value
1556 * are as for raw_notifier_call_chain().
1559 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1562 return raw_notifier_call_chain(&netdev_chain, val, dev);
1564 EXPORT_SYMBOL(call_netdevice_notifiers);
1566 static struct static_key netstamp_needed __read_mostly;
1567 #ifdef HAVE_JUMP_LABEL
1568 /* We are not allowed to call static_key_slow_dec() from irq context.
1569 * If net_disable_timestamp() is called from irq context, defer the
1570 * static_key_slow_dec() calls.
1572 static atomic_t netstamp_needed_deferred;
1575 void net_enable_timestamp(void)
1577 #ifdef HAVE_JUMP_LABEL
1578 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1582 static_key_slow_dec(&netstamp_needed);
1586 WARN_ON(in_interrupt());
1587 static_key_slow_inc(&netstamp_needed);
1589 EXPORT_SYMBOL(net_enable_timestamp);
1591 void net_disable_timestamp(void)
1593 #ifdef HAVE_JUMP_LABEL
1594 if (in_interrupt()) {
1595 atomic_inc(&netstamp_needed_deferred);
1599 static_key_slow_dec(&netstamp_needed);
1601 EXPORT_SYMBOL(net_disable_timestamp);
1603 static inline void net_timestamp_set(struct sk_buff *skb)
1605 skb->tstamp.tv64 = 0;
1606 if (static_key_false(&netstamp_needed))
1607 __net_timestamp(skb);
1610 #define net_timestamp_check(COND, SKB) \
1611 if (static_key_false(&netstamp_needed)) { \
1612 if ((COND) && !(SKB)->tstamp.tv64) \
1613 __net_timestamp(SKB); \
1616 static inline bool is_skb_forwardable(struct net_device *dev,
1617 struct sk_buff *skb)
1621 if (!(dev->flags & IFF_UP))
1624 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1625 if (skb->len <= len)
1628 /* if TSO is enabled, we don't care about the length as the packet
1629 * could be forwarded without being segmented before
1631 if (skb_is_gso(skb))
1638 * dev_forward_skb - loopback an skb to another netif
1640 * @dev: destination network device
1641 * @skb: buffer to forward
1644 * NET_RX_SUCCESS (no congestion)
1645 * NET_RX_DROP (packet was dropped, but freed)
1647 * dev_forward_skb can be used for injecting an skb from the
1648 * start_xmit function of one device into the receive queue
1649 * of another device.
1651 * The receiving device may be in another namespace, so
1652 * we have to clear all information in the skb that could
1653 * impact namespace isolation.
1655 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1657 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1658 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1659 atomic_long_inc(&dev->rx_dropped);
1668 if (unlikely(!is_skb_forwardable(dev, skb))) {
1669 atomic_long_inc(&dev->rx_dropped);
1676 skb->tstamp.tv64 = 0;
1677 skb->pkt_type = PACKET_HOST;
1678 skb->protocol = eth_type_trans(skb, dev);
1682 return netif_rx(skb);
1684 EXPORT_SYMBOL_GPL(dev_forward_skb);
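/* Example (illustrative sketch): a paired virtual device, in the style of
 * veth, handing a transmitted skb straight to its peer's receive path.
 * my_xmit and my_get_peer are hypothetical.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */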
1686 static inline int deliver_skb(struct sk_buff *skb,
1687 struct packet_type *pt_prev,
1688 struct net_device *orig_dev)
1690 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1692 atomic_inc(&skb->users);
1693 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1696 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1698 if (!ptype->af_packet_priv || !skb->sk)
1701 if (ptype->id_match)
1702 return ptype->id_match(ptype, skb->sk);
1703 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1710 * Support routine. Sends outgoing frames to any network
1711 * taps currently in use.
1714 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1716 struct packet_type *ptype;
1717 struct sk_buff *skb2 = NULL;
1718 struct packet_type *pt_prev = NULL;
1721 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1722 /* Never send packets back to the socket
1723 * they originated from - MvS (miquels@drinkel.ow.org)
1725 if ((ptype->dev == dev || !ptype->dev) &&
1726 (!skb_loop_sk(ptype, skb))) {
1728 deliver_skb(skb2, pt_prev, skb->dev);
1733 skb2 = skb_clone(skb, GFP_ATOMIC);
1737 net_timestamp_set(skb2);
1739 /* skb->nh should be correctly
1740 set by sender, so that the second statement is
1741 just protection against buggy protocols.
1743 skb_reset_mac_header(skb2);
1745 if (skb_network_header(skb2) < skb2->data ||
1746 skb2->network_header > skb2->tail) {
1747 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1748 ntohs(skb2->protocol),
1750 skb_reset_network_header(skb2);
1753 skb2->transport_header = skb2->network_header;
1754 skb2->pkt_type = PACKET_OUTGOING;
1759 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1764 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1765 * @dev: Network device
1766 * @txq: number of queues available
1768 * If real_num_tx_queues is changed the tc mappings may no longer be
1769 * valid. To resolve this, verify that the tc mappings remain valid and,
1770 * if not, zero the mapping. Once no priorities map to an
1771 * offset/count pair it will no longer be used. In the worst case, if
1772 * TC0 is invalid nothing can be done, so priority mappings are disabled.
1773 * It is expected that drivers will fix this mapping if they can before
1774 * calling netif_set_real_num_tx_queues.
1776 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1779 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1781 /* If TC0 is invalidated disable TC mapping */
1782 if (tc->offset + tc->count > txq) {
1783 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1788 /* Invalidated prio to tc mappings set to TC0 */
1789 for (i = 1; i < TC_BITMASK + 1; i++) {
1790 int q = netdev_get_prio_tc_map(dev, i);
1792 tc = &dev->tc_to_txq[q];
1793 if (tc->offset + tc->count > txq) {
1794 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1796 netdev_set_prio_tc_map(dev, i, 0);
1802 static DEFINE_MUTEX(xps_map_mutex);
1803 #define xmap_dereference(P) \
1804 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1806 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1809 struct xps_map *map = NULL;
1813 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1815 for (pos = 0; map && pos < map->len; pos++) {
1816 if (map->queues[pos] == index) {
1818 map->queues[pos] = map->queues[--map->len];
1820 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1821 kfree_rcu(map, rcu);
1831 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1833 struct xps_dev_maps *dev_maps;
1835 bool active = false;
1837 mutex_lock(&xps_map_mutex);
1838 dev_maps = xmap_dereference(dev->xps_maps);
1843 for_each_possible_cpu(cpu) {
1844 for (i = index; i < dev->num_tx_queues; i++) {
1845 if (!remove_xps_queue(dev_maps, cpu, i))
1848 if (i == dev->num_tx_queues)
1853 RCU_INIT_POINTER(dev->xps_maps, NULL);
1854 kfree_rcu(dev_maps, rcu);
1857 for (i = index; i < dev->num_tx_queues; i++)
1858 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1862 mutex_unlock(&xps_map_mutex);
1865 static struct xps_map *expand_xps_map(struct xps_map *map,
1868 struct xps_map *new_map;
1869 int alloc_len = XPS_MIN_MAP_ALLOC;
1872 for (pos = 0; map && pos < map->len; pos++) {
1873 if (map->queues[pos] != index)
1878 /* Need to add queue to this CPU's existing map */
1880 if (pos < map->alloc_len)
1883 alloc_len = map->alloc_len * 2;
1886 /* Need to allocate new map to store queue on this CPU's map */
1887 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1892 for (i = 0; i < pos; i++)
1893 new_map->queues[i] = map->queues[i];
1894 new_map->alloc_len = alloc_len;
1900 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1902 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1903 struct xps_map *map, *new_map;
1904 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1905 int cpu, numa_node_id = -2;
1906 bool active = false;
1908 mutex_lock(&xps_map_mutex);
1910 dev_maps = xmap_dereference(dev->xps_maps);
1912 /* allocate memory for queue storage */
1913 for_each_online_cpu(cpu) {
1914 if (!cpumask_test_cpu(cpu, mask))
1918 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1922 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1925 map = expand_xps_map(map, cpu, index);
1929 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1933 goto out_no_new_maps;
1935 for_each_possible_cpu(cpu) {
1936 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1937 /* add queue to CPU maps */
1940 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1941 while ((pos < map->len) && (map->queues[pos] != index))
1944 if (pos == map->len)
1945 map->queues[map->len++] = index;
1947 if (numa_node_id == -2)
1948 numa_node_id = cpu_to_node(cpu);
1949 else if (numa_node_id != cpu_to_node(cpu))
1952 } else if (dev_maps) {
1953 /* fill in the new device map from the old device map */
1954 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1955 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1960 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1962 /* Cleanup old maps */
1964 for_each_possible_cpu(cpu) {
1965 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1966 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1967 if (map && map != new_map)
1968 kfree_rcu(map, rcu);
1971 kfree_rcu(dev_maps, rcu);
1974 dev_maps = new_dev_maps;
1978 /* update Tx queue numa node */
1979 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1980 (numa_node_id >= 0) ? numa_node_id :
1986 /* removes queue from unused CPUs */
1987 for_each_possible_cpu(cpu) {
1988 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1991 if (remove_xps_queue(dev_maps, cpu, index))
1995 /* free map if not active */
1997 RCU_INIT_POINTER(dev->xps_maps, NULL);
1998 kfree_rcu(dev_maps, rcu);
2002 mutex_unlock(&xps_map_mutex);
2006 /* remove any maps that we added */
2007 for_each_possible_cpu(cpu) {
2008 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2009 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2011 if (new_map && new_map != map)
2015 mutex_unlock(&xps_map_mutex);
2017 kfree(new_dev_maps);
2020 EXPORT_SYMBOL(netif_set_xps_queue);
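/* Example (illustrative sketch): a multiqueue driver pinning transmit queue
 * i to CPU i, assuming it allocated one queue per online CPU.
 *
 *	cpumask_var_t mask;
 *	int i;
 *
 *	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
 *		return -ENOMEM;
 *	for (i = 0; i < dev->real_num_tx_queues; i++) {
 *		cpumask_clear(mask);
 *		cpumask_set_cpu(i, mask);
 *		netif_set_xps_queue(dev, mask, i);
 *	}
 *	free_cpumask_var(mask);
 */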
2024 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2025 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2027 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2031 if (txq < 1 || txq > dev->num_tx_queues)
2034 if (dev->reg_state == NETREG_REGISTERED ||
2035 dev->reg_state == NETREG_UNREGISTERING) {
2038 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2044 netif_setup_tc(dev, txq);
2046 if (txq < dev->real_num_tx_queues) {
2047 qdisc_reset_all_tx_gt(dev, txq);
2049 netif_reset_xps_queues_gt(dev, txq);
2054 dev->real_num_tx_queues = txq;
2057 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2061 * netif_set_real_num_rx_queues - set actual number of RX queues used
2062 * @dev: Network device
2063 * @rxq: Actual number of RX queues
2065 * This must be called either with the rtnl_lock held or before
2066 * registration of the net device. Returns 0 on success, or a
2067 * negative error code. If called before registration, it always succeeds.
2070 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2074 if (rxq < 1 || rxq > dev->num_rx_queues)
2077 if (dev->reg_state == NETREG_REGISTERED) {
2080 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2086 dev->real_num_rx_queues = rxq;
2089 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
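/* Example (illustrative sketch): a driver shrinking to the number of queue
 * pairs it actually brought up; nr_active is a hypothetical driver variable.
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_tx_queues(dev, nr_active);
 *	if (!err)
 *		err = netif_set_real_num_rx_queues(dev, nr_active);
 *	rtnl_unlock();
 */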
2093 * netif_get_num_default_rss_queues - default number of RSS queues
2095 * This routine should set an upper limit on the number of RSS queues
2096 * used by default by multiqueue devices.
2098 int netif_get_num_default_rss_queues(void)
2100 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2102 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2104 static inline void __netif_reschedule(struct Qdisc *q)
2106 struct softnet_data *sd;
2107 unsigned long flags;
2109 local_irq_save(flags);
2110 sd = &__get_cpu_var(softnet_data);
2111 q->next_sched = NULL;
2112 *sd->output_queue_tailp = q;
2113 sd->output_queue_tailp = &q->next_sched;
2114 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2115 local_irq_restore(flags);
2118 void __netif_schedule(struct Qdisc *q)
2120 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2121 __netif_reschedule(q);
2123 EXPORT_SYMBOL(__netif_schedule);
2125 void dev_kfree_skb_irq(struct sk_buff *skb)
2127 if (atomic_dec_and_test(&skb->users)) {
2128 struct softnet_data *sd;
2129 unsigned long flags;
2131 local_irq_save(flags);
2132 sd = &__get_cpu_var(softnet_data);
2133 skb->next = sd->completion_queue;
2134 sd->completion_queue = skb;
2135 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2136 local_irq_restore(flags);
2139 EXPORT_SYMBOL(dev_kfree_skb_irq);
2141 void dev_kfree_skb_any(struct sk_buff *skb)
2143 if (in_irq() || irqs_disabled())
2144 dev_kfree_skb_irq(skb);
2148 EXPORT_SYMBOL(dev_kfree_skb_any);
2152 * netif_device_detach - mark device as removed
2153 * @dev: network device
2155 * Mark device as removed from system and therefore no longer available.
2157 void netif_device_detach(struct net_device *dev)
2159 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2160 netif_running(dev)) {
2161 netif_tx_stop_all_queues(dev);
2164 EXPORT_SYMBOL(netif_device_detach);
2167 * netif_device_attach - mark device as attached
2168 * @dev: network device
2170 * Mark device as attached to the system and restart it if needed.
2172 void netif_device_attach(struct net_device *dev)
2174 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2175 netif_running(dev)) {
2176 netif_tx_wake_all_queues(dev);
2177 __netdev_watchdog_up(dev);
2180 EXPORT_SYMBOL(netif_device_attach);
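/* Example (illustrative sketch): the usual pairing in a driver's power
 * management hooks; my_suspend and my_resume are hypothetical.
 *
 *	static int my_suspend(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_detach(dev);	// stop the stack using the device
 *		// ... put the hardware to sleep ...
 *		return 0;
 *	}
 *
 *	static int my_resume(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		// ... bring the hardware back up ...
 *		netif_device_attach(dev);	// restart queues if it was running
 *		return 0;
 *	}
 */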
2182 static void skb_warn_bad_offload(const struct sk_buff *skb)
2184 static const netdev_features_t null_features = 0;
2185 struct net_device *dev = skb->dev;
2186 const char *driver = "";
2188 if (dev && dev->dev.parent)
2189 driver = dev_driver_string(dev->dev.parent);
2191 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2192 "gso_type=%d ip_summed=%d\n",
2193 driver, dev ? &dev->features : &null_features,
2194 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2195 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2196 skb_shinfo(skb)->gso_type, skb->ip_summed);
2200 * Invalidate hardware checksum when packet is to be mangled, and
2201 * complete checksum manually on outgoing path.
2203 int skb_checksum_help(struct sk_buff *skb)
2206 int ret = 0, offset;
2208 if (skb->ip_summed == CHECKSUM_COMPLETE)
2209 goto out_set_summed;
2211 if (unlikely(skb_shinfo(skb)->gso_size)) {
2212 skb_warn_bad_offload(skb);
2216 /* Before computing a checksum, we should make sure no frag could
2217 * be modified by an external entity: the checksum could otherwise be wrong.
2219 if (skb_has_shared_frag(skb)) {
2220 ret = __skb_linearize(skb);
2225 offset = skb_checksum_start_offset(skb);
2226 BUG_ON(offset >= skb_headlen(skb));
2227 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2229 offset += skb->csum_offset;
2230 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2232 if (skb_cloned(skb) &&
2233 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2234 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2239 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2241 skb->ip_summed = CHECKSUM_NONE;
2245 EXPORT_SYMBOL(skb_checksum_help);
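/* Example (illustrative sketch): a driver's transmit path falling back to a
 * software checksum when the hardware cannot offload this particular packet;
 * my_hw_can_csum is hypothetical.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL && !my_hw_can_csum(skb)) {
 *		if (skb_checksum_help(skb))
 *			goto drop_packet;
 *	}
 */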
2248 * skb_mac_gso_segment - mac layer segmentation handler.
2249 * @skb: buffer to segment
2250 * @features: features for the output path (see dev->features)
2252 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2253 netdev_features_t features)
2255 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2256 struct packet_offload *ptype;
2257 __be16 type = skb->protocol;
2259 while (type == htons(ETH_P_8021Q)) {
2260 int vlan_depth = ETH_HLEN;
2261 struct vlan_hdr *vh;
2263 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2264 return ERR_PTR(-EINVAL);
2266 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2267 type = vh->h_vlan_encapsulated_proto;
2268 vlan_depth += VLAN_HLEN;
2271 __skb_pull(skb, skb->mac_len);
2274 list_for_each_entry_rcu(ptype, &offload_base, list) {
2275 if (ptype->type == type && ptype->callbacks.gso_segment) {
2276 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2279 err = ptype->callbacks.gso_send_check(skb);
2280 segs = ERR_PTR(err);
2281 if (err || skb_gso_ok(skb, features))
2283 __skb_push(skb, (skb->data -
2284 skb_network_header(skb)));
2286 segs = ptype->callbacks.gso_segment(skb, features);
2292 __skb_push(skb, skb->data - skb_mac_header(skb));
2296 EXPORT_SYMBOL(skb_mac_gso_segment);
2299 /* openvswitch calls this on rx path, so we need a different check.
2301 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2304 return skb->ip_summed != CHECKSUM_PARTIAL;
2306 return skb->ip_summed == CHECKSUM_NONE;
2310 * __skb_gso_segment - Perform segmentation on skb.
2311 * @skb: buffer to segment
2312 * @features: features for the output path (see dev->features)
2313 * @tx_path: whether it is called in TX path
2315 * This function segments the given skb and returns a list of segments.
2317 * It may return NULL if the skb requires no segmentation. This is
2318 * only possible when GSO is used for verifying header integrity.
2320 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2321 netdev_features_t features, bool tx_path)
2323 if (unlikely(skb_needs_check(skb, tx_path))) {
2326 skb_warn_bad_offload(skb);
2328 if (skb_header_cloned(skb) &&
2329 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2330 return ERR_PTR(err);
2333 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2334 skb_reset_mac_header(skb);
2335 skb_reset_mac_len(skb);
2337 return skb_mac_gso_segment(skb, features);
2339 EXPORT_SYMBOL(__skb_gso_segment);
2341 /* Take action when hardware reception checksum errors are detected. */
2343 void netdev_rx_csum_fault(struct net_device *dev)
2345 if (net_ratelimit()) {
2346 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2350 EXPORT_SYMBOL(netdev_rx_csum_fault);
2353 /* Actually, we should eliminate this check as soon as we know that:
2354 * 1. The IOMMU is present and can map all the memory.
2355 * 2. No high memory really exists on this machine.
2358 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2360 #ifdef CONFIG_HIGHMEM
2362 if (!(dev->features & NETIF_F_HIGHDMA)) {
2363 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2364 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2365 if (PageHighMem(skb_frag_page(frag)))
2370 if (PCI_DMA_BUS_IS_PHYS) {
2371 struct device *pdev = dev->dev.parent;
2375 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2376 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2377 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2378 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2387 void (*destructor)(struct sk_buff *skb);
2390 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2392 static void dev_gso_skb_destructor(struct sk_buff *skb)
2394 struct dev_gso_cb *cb;
2397 struct sk_buff *nskb = skb->next;
2399 skb->next = nskb->next;
2402 } while (skb->next);
2404 cb = DEV_GSO_CB(skb);
2406 cb->destructor(skb);
2410 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2411 * @skb: buffer to segment
2412 * @features: device features as applicable to this skb
2414 * This function segments the given skb and stores the list of segments in skb->next.
2417 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2419 struct sk_buff *segs;
2421 segs = skb_gso_segment(skb, features);
2423 /* Verifying header integrity only. */
2428 return PTR_ERR(segs);
2431 DEV_GSO_CB(skb)->destructor = skb->destructor;
2432 skb->destructor = dev_gso_skb_destructor;
2437 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2439 return ((features & NETIF_F_GEN_CSUM) ||
2440 ((features & NETIF_F_V4_CSUM) &&
2441 protocol == htons(ETH_P_IP)) ||
2442 ((features & NETIF_F_V6_CSUM) &&
2443 protocol == htons(ETH_P_IPV6)) ||
2444 ((features & NETIF_F_FCOE_CRC) &&
2445 protocol == htons(ETH_P_FCOE)));
2448 static netdev_features_t harmonize_features(struct sk_buff *skb,
2449 __be16 protocol, netdev_features_t features)
2451 if (skb->ip_summed != CHECKSUM_NONE &&
2452 !can_checksum_protocol(features, protocol)) {
2453 features &= ~NETIF_F_ALL_CSUM;
2454 features &= ~NETIF_F_SG;
2455 } else if (illegal_highdma(skb->dev, skb)) {
2456 features &= ~NETIF_F_SG;
2462 netdev_features_t netif_skb_features(struct sk_buff *skb)
2464 __be16 protocol = skb->protocol;
2465 netdev_features_t features = skb->dev->features;
2467 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2468 features &= ~NETIF_F_GSO_MASK;
2470 if (protocol == htons(ETH_P_8021Q)) {
2471 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2472 protocol = veh->h_vlan_encapsulated_proto;
2473 } else if (!vlan_tx_tag_present(skb)) {
2474 return harmonize_features(skb, protocol, features);
2477 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2479 if (protocol != htons(ETH_P_8021Q)) {
2480 return harmonize_features(skb, protocol, features);
2482 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2483 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2484 return harmonize_features(skb, protocol, features);
2487 EXPORT_SYMBOL(netif_skb_features);
2490 * Returns true if either:
2491 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2492 * 2. skb is fragmented and the device does not support SG.
2494 static inline int skb_needs_linearize(struct sk_buff *skb,
2497 return skb_is_nonlinear(skb) &&
2498 ((skb_has_frag_list(skb) &&
2499 !(features & NETIF_F_FRAGLIST)) ||
2500 (skb_shinfo(skb)->nr_frags &&
2501 !(features & NETIF_F_SG)));
2504 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2505 struct netdev_queue *txq)
2507 const struct net_device_ops *ops = dev->netdev_ops;
2508 int rc = NETDEV_TX_OK;
2509 unsigned int skb_len;
2511 if (likely(!skb->next)) {
2512 netdev_features_t features;
2515 * If the device doesn't need skb->dst, release it right now while
2516 * it's hot in this CPU's cache.
2518 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2521 features = netif_skb_features(skb);
2523 if (vlan_tx_tag_present(skb) &&
2524 !(features & NETIF_F_HW_VLAN_TX)) {
2525 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2532 /* If this is an encapsulation offload request, verify we are testing
2533 * hardware encapsulation features instead of the standard
2534 * features for the netdev.
2536 if (skb->encapsulation)
2537 features &= dev->hw_enc_features;
2539 if (netif_needs_gso(skb, features)) {
2540 if (unlikely(dev_gso_segment(skb, features)))
2545 if (skb_needs_linearize(skb, features) &&
2546 __skb_linearize(skb))
2549 /* If the packet is not checksummed and the device does not
2550 * support checksumming for this protocol, complete the
2551 * checksum here.
2553 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2554 if (skb->encapsulation)
2555 skb_set_inner_transport_header(skb,
2556 skb_checksum_start_offset(skb));
2558 skb_set_transport_header(skb,
2559 skb_checksum_start_offset(skb));
2560 if (!(features & NETIF_F_ALL_CSUM) &&
2561 skb_checksum_help(skb))
2566 if (!list_empty(&ptype_all))
2567 dev_queue_xmit_nit(skb, dev);
2570 rc = ops->ndo_start_xmit(skb, dev);
2571 trace_net_dev_xmit(skb, rc, dev, skb_len);
2572 if (rc == NETDEV_TX_OK)
2573 txq_trans_update(txq);
2579 struct sk_buff *nskb = skb->next;
2581 skb->next = nskb->next;
2585 * If the device doesn't need nskb->dst, release it right now while
2586 * it's hot in this CPU's cache.
2588 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2591 if (!list_empty(&ptype_all))
2592 dev_queue_xmit_nit(nskb, dev);
2594 skb_len = nskb->len;
2595 rc = ops->ndo_start_xmit(nskb, dev);
2596 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2597 if (unlikely(rc != NETDEV_TX_OK)) {
2598 if (rc & ~NETDEV_TX_MASK)
2599 goto out_kfree_gso_skb;
2600 nskb->next = skb->next;
2604 txq_trans_update(txq);
2605 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2606 return NETDEV_TX_BUSY;
2607 } while (skb->next);
2610 if (likely(skb->next == NULL))
2611 skb->destructor = DEV_GSO_CB(skb)->destructor;
2618 static void qdisc_pkt_len_init(struct sk_buff *skb)
2620 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2622 qdisc_skb_cb(skb)->pkt_len = skb->len;
2624 /* To get a more precise estimate of the bytes sent on the wire,
2625 * we add the header size of every segment to pkt_len.
2627 if (shinfo->gso_size) {
2628 unsigned int hdr_len;
2630 /* mac layer + network layer */
2631 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2633 /* + transport layer */
2634 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2635 hdr_len += tcp_hdrlen(skb);
2637 hdr_len += sizeof(struct udphdr);
2638 qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len;
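/* Editor's note (illustrative numbers, not from the source): for a TCPv4 GSO
 * skb with skb->len = 4406, gso_segs = 3 and a 66-byte header stack
 * (14 MAC + 20 IP + 32 TCP with timestamps), the estimate above gives
 *
 *	pkt_len = 4406 + (3 - 1) * 66 = 4538
 *
 * which matches the 3 * 66 + 4340 bytes the three wire frames actually carry.
 */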
2642 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2643 struct net_device *dev,
2644 struct netdev_queue *txq)
2646 spinlock_t *root_lock = qdisc_lock(q);
2650 qdisc_pkt_len_init(skb);
2651 qdisc_calculate_pkt_len(skb, q);
2653 * Heuristic to force contended enqueues to serialize on a
2654 * separate lock before trying to get the qdisc main lock.
2655 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2656 * and dequeue packets faster.
2658 contended = qdisc_is_running(q);
2659 if (unlikely(contended))
2660 spin_lock(&q->busylock);
2662 spin_lock(root_lock);
2663 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2666 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2667 qdisc_run_begin(q)) {
2669 * This is a work-conserving queue; there are no old skbs
2670 * waiting to be sent out; and the qdisc is not running -
2671 * xmit the skb directly.
2673 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2676 qdisc_bstats_update(q, skb);
2678 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2679 if (unlikely(contended)) {
2680 spin_unlock(&q->busylock);
2687 rc = NET_XMIT_SUCCESS;
2690 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2691 if (qdisc_run_begin(q)) {
2692 if (unlikely(contended)) {
2693 spin_unlock(&q->busylock);
2699 spin_unlock(root_lock);
2700 if (unlikely(contended))
2701 spin_unlock(&q->busylock);
2705 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2706 static void skb_update_prio(struct sk_buff *skb)
2708 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2710 if (!skb->priority && skb->sk && map) {
2711 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2713 if (prioidx < map->priomap_len)
2714 skb->priority = map->priomap[prioidx];
2718 #define skb_update_prio(skb)
2721 static DEFINE_PER_CPU(int, xmit_recursion);
2722 #define RECURSION_LIMIT 10
2725 * dev_loopback_xmit - loop back @skb
2726 * @skb: buffer to transmit
2728 int dev_loopback_xmit(struct sk_buff *skb)
2730 skb_reset_mac_header(skb);
2731 __skb_pull(skb, skb_network_offset(skb));
2732 skb->pkt_type = PACKET_LOOPBACK;
2733 skb->ip_summed = CHECKSUM_UNNECESSARY;
2734 WARN_ON(!skb_dst(skb));
2739 EXPORT_SYMBOL(dev_loopback_xmit);
2742 * dev_queue_xmit - transmit a buffer
2743 * @skb: buffer to transmit
2745 * Queue a buffer for transmission to a network device. The caller must
2746 * have set the device and priority and built the buffer before calling
2747 * this function. The function can be called from an interrupt.
2749 * A negative errno code is returned on a failure. A success does not
2750 * guarantee the frame will be transmitted as it may be dropped due
2751 * to congestion or traffic shaping.
2753 * -----------------------------------------------------------------------------------
2754 * I notice this method can also return errors from the queue disciplines,
2755 * including NET_XMIT_DROP, which is a positive value. So, errors can also be positive.
2758 * Regardless of the return value, the skb is consumed, so it is currently
2759 * difficult to retry a send to this method. (You can bump the ref count
2760 * before sending to hold a reference for retry if you are careful.)
2762 * When calling this method, interrupts MUST be enabled. This is because
2763 * the BH enable code must have IRQs enabled so that it will not deadlock.
2766 int dev_queue_xmit(struct sk_buff *skb)
2768 struct net_device *dev = skb->dev;
2769 struct netdev_queue *txq;
2773 skb_reset_mac_header(skb);
2775 /* Disable soft irqs for various locks below. Also
2776 * stops preemption for RCU.
2780 skb_update_prio(skb);
2782 txq = netdev_pick_tx(dev, skb);
2783 q = rcu_dereference_bh(txq->qdisc);
2785 #ifdef CONFIG_NET_CLS_ACT
2786 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2788 trace_net_dev_queue(skb);
2790 rc = __dev_xmit_skb(skb, q, dev, txq);
2794 /* The device has no queue. This is the common case for software devices:
2795 loopback, all sorts of tunnels...
2797 Really, it is unlikely that netif_tx_lock protection is necessary
2798 here. (e.g. loopback and IP tunnels are clean, ignoring statistics counters.)
2800 However, it is possible that they rely on protection made by us here.
2803 Check this and shoot the lock. It is not prone to deadlocks.
2804 Either shoot the noqueue qdisc, it is even simpler 8)
2806 if (dev->flags & IFF_UP) {
2807 int cpu = smp_processor_id(); /* ok because BHs are off */
2809 if (txq->xmit_lock_owner != cpu) {
2811 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2812 goto recursion_alert;
2814 HARD_TX_LOCK(dev, txq, cpu);
2816 if (!netif_xmit_stopped(txq)) {
2817 __this_cpu_inc(xmit_recursion);
2818 rc = dev_hard_start_xmit(skb, dev, txq);
2819 __this_cpu_dec(xmit_recursion);
2820 if (dev_xmit_complete(rc)) {
2821 HARD_TX_UNLOCK(dev, txq);
2825 HARD_TX_UNLOCK(dev, txq);
2826 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2829 /* Recursion is detected! It is possible, unfortunately. */
2833 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2839 rcu_read_unlock_bh();
2844 rcu_read_unlock_bh();
2847 EXPORT_SYMBOL(dev_queue_xmit);
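/* Editor's example (not part of dev.c): a minimal sketch of building a frame
 * and handing it to dev_queue_xmit(). The ETH_P_IP protocol value and the
 * payload are illustrative; a real user would also fill in a valid network
 * header.
 */
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>

static int foo_send_raw(struct net_device *dev, const u8 *dest_mac,
			const void *payload, size_t len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));	/* room for the MAC header */
	skb_reset_network_header(skb);
	memcpy(skb_put(skb, len), payload, len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);
	if (dev_hard_header(skb, dev, ETH_P_IP, dest_mac, NULL, skb->len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/* The skb is consumed regardless of the return value. */
	return dev_queue_xmit(skb);
}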
2850 /*=======================================================================
2851 Receiver routines
2852 =======================================================================*/
2854 int netdev_max_backlog __read_mostly = 1000;
2855 EXPORT_SYMBOL(netdev_max_backlog);
2857 int netdev_tstamp_prequeue __read_mostly = 1;
2858 int netdev_budget __read_mostly = 300;
2859 int weight_p __read_mostly = 64; /* old backlog weight */
2861 /* Called with irq disabled */
2862 static inline void ____napi_schedule(struct softnet_data *sd,
2863 struct napi_struct *napi)
2865 list_add_tail(&napi->poll_list, &sd->poll_list);
2866 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2871 /* One global table that all flow-based protocols share. */
2872 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2873 EXPORT_SYMBOL(rps_sock_flow_table);
2875 struct static_key rps_needed __read_mostly;
2877 static struct rps_dev_flow *
2878 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2879 struct rps_dev_flow *rflow, u16 next_cpu)
2881 if (next_cpu != RPS_NO_CPU) {
2882 #ifdef CONFIG_RFS_ACCEL
2883 struct netdev_rx_queue *rxqueue;
2884 struct rps_dev_flow_table *flow_table;
2885 struct rps_dev_flow *old_rflow;
2890 /* Should we steer this flow to a different hardware queue? */
2891 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2892 !(dev->features & NETIF_F_NTUPLE))
2894 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2895 if (rxq_index == skb_get_rx_queue(skb))
2898 rxqueue = dev->_rx + rxq_index;
2899 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2902 flow_id = skb->rxhash & flow_table->mask;
2903 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2904 rxq_index, flow_id);
2908 rflow = &flow_table->flows[flow_id];
2910 if (old_rflow->filter == rflow->filter)
2911 old_rflow->filter = RPS_NO_FILTER;
2915 per_cpu(softnet_data, next_cpu).input_queue_head;
2918 rflow->cpu = next_cpu;
2923 * get_rps_cpu is called from netif_receive_skb and returns the target
2924 * CPU from the RPS map of the receiving queue for a given skb.
2925 * rcu_read_lock must be held on entry.
2927 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2928 struct rps_dev_flow **rflowp)
2930 struct netdev_rx_queue *rxqueue;
2931 struct rps_map *map;
2932 struct rps_dev_flow_table *flow_table;
2933 struct rps_sock_flow_table *sock_flow_table;
2937 if (skb_rx_queue_recorded(skb)) {
2938 u16 index = skb_get_rx_queue(skb);
2939 if (unlikely(index >= dev->real_num_rx_queues)) {
2940 WARN_ONCE(dev->real_num_rx_queues > 1,
2941 "%s received packet on queue %u, but number "
2942 "of RX queues is %u\n",
2943 dev->name, index, dev->real_num_rx_queues);
2946 rxqueue = dev->_rx + index;
2950 map = rcu_dereference(rxqueue->rps_map);
2952 if (map->len == 1 &&
2953 !rcu_access_pointer(rxqueue->rps_flow_table)) {
2954 tcpu = map->cpus[0];
2955 if (cpu_online(tcpu))
2959 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2963 skb_reset_network_header(skb);
2964 if (!skb_get_rxhash(skb))
2967 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2968 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2969 if (flow_table && sock_flow_table) {
2971 struct rps_dev_flow *rflow;
2973 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2976 next_cpu = sock_flow_table->ents[skb->rxhash &
2977 sock_flow_table->mask];
2980 * If the desired CPU (where last recvmsg was done) is
2981 * different from current CPU (one in the rx-queue flow
2982 * table entry), switch if one of the following holds:
2983 * - Current CPU is unset (equal to RPS_NO_CPU).
2984 * - Current CPU is offline.
2985 * - The current CPU's queue tail has advanced beyond the
2986 * last packet that was enqueued using this table entry.
2987 * This guarantees that all previous packets for the flow
2988 * have been dequeued, thus preserving in order delivery.
2990 if (unlikely(tcpu != next_cpu) &&
2991 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2992 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2993 rflow->last_qtail)) >= 0)) {
2995 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2998 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3006 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3008 if (cpu_online(tcpu)) {
3018 #ifdef CONFIG_RFS_ACCEL
3021 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3022 * @dev: Device on which the filter was set
3023 * @rxq_index: RX queue index
3024 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3025 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3027 * Drivers that implement ndo_rx_flow_steer() should periodically call
3028 * this function for each installed filter and remove the filters for
3029 * which it returns %true.
3031 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3032 u32 flow_id, u16 filter_id)
3034 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3035 struct rps_dev_flow_table *flow_table;
3036 struct rps_dev_flow *rflow;
3041 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3042 if (flow_table && flow_id <= flow_table->mask) {
3043 rflow = &flow_table->flows[flow_id];
3044 cpu = ACCESS_ONCE(rflow->cpu);
3045 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3046 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3047 rflow->last_qtail) <
3048 (int)(10 * flow_table->mask)))
3054 EXPORT_SYMBOL(rps_may_expire_flow);
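/* Editor's example (not part of dev.c): a sketch of the periodic expiry scan
 * an RFS-accelerated driver might run. struct foo_rx_queue, struct foo_filter
 * and the removal step are hypothetical; only rps_may_expire_flow() is real.
 */
#include <linux/netdevice.h>

struct foo_filter {
	bool installed;
	u32 flow_id;	/* flow_id passed to ndo_rx_flow_steer() */
	u16 filter_id;	/* id returned by ndo_rx_flow_steer() */
};

struct foo_rx_queue {
	struct net_device *netdev;
	u16 index;	/* RX queue index */
	unsigned int n_filters;
	struct foo_filter *filters;
};

static void foo_expire_filters(struct foo_rx_queue *rxq)
{
	unsigned int i;

	for (i = 0; i < rxq->n_filters; i++) {
		struct foo_filter *f = &rxq->filters[i];

		if (!f->installed)
			continue;
		if (rps_may_expire_flow(rxq->netdev, rxq->index,
					f->flow_id, f->filter_id)) {
			/* remove the corresponding hardware filter here */
			f->installed = false;
		}
	}
}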
3056 #endif /* CONFIG_RFS_ACCEL */
3058 /* Called from hardirq (IPI) context */
3059 static void rps_trigger_softirq(void *data)
3061 struct softnet_data *sd = data;
3063 ____napi_schedule(sd, &sd->backlog);
3067 #endif /* CONFIG_RPS */
3070 * Check if this softnet_data structure belongs to another CPU.
3071 * If so, queue it to our IPI list and return 1.
3074 static int rps_ipi_queued(struct softnet_data *sd)
3077 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3080 sd->rps_ipi_next = mysd->rps_ipi_list;
3081 mysd->rps_ipi_list = sd;
3083 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3086 #endif /* CONFIG_RPS */
3091 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3092 * queue (may be a remote CPU queue).
3094 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3095 unsigned int *qtail)
3097 struct softnet_data *sd;
3098 unsigned long flags;
3100 sd = &per_cpu(softnet_data, cpu);
3102 local_irq_save(flags);
3105 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3106 if (skb_queue_len(&sd->input_pkt_queue)) {
3108 __skb_queue_tail(&sd->input_pkt_queue, skb);
3109 input_queue_tail_incr_save(sd, qtail);
3111 local_irq_restore(flags);
3112 return NET_RX_SUCCESS;
3115 /* Schedule NAPI for the backlog device.
3116 * We can use a non-atomic operation since we own the queue lock.
3118 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3119 if (!rps_ipi_queued(sd))
3120 ____napi_schedule(sd, &sd->backlog);
3128 local_irq_restore(flags);
3130 atomic_long_inc(&skb->dev->rx_dropped);
3136 * netif_rx - post buffer to the network code
3137 * @skb: buffer to post
3139 * This function receives a packet from a device driver and queues it for
3140 * the upper (protocol) levels to process. It always succeeds. The buffer
3141 * may be dropped during processing for congestion control or by the protocol layers.
3145 * NET_RX_SUCCESS (no congestion)
3146 * NET_RX_DROP (packet was dropped)
3150 int netif_rx(struct sk_buff *skb)
3154 /* if netpoll wants it, pretend we never saw it */
3155 if (netpoll_rx(skb))
3158 net_timestamp_check(netdev_tstamp_prequeue, skb);
3160 trace_netif_rx(skb);
3162 if (static_key_false(&rps_needed)) {
3163 struct rps_dev_flow voidflow, *rflow = &voidflow;
3169 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3171 cpu = smp_processor_id();
3173 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3181 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3186 EXPORT_SYMBOL(netif_rx);
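/* Editor's example (not part of dev.c): the classic non-NAPI receive path.
 * The driver copies one frame out of its hardware buffer and hands it to the
 * stack with netif_rx(). All foo_* names are hypothetical.
 */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>

static void foo_rx_one(struct net_device *dev, const void *hw_buf,
		       unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), hw_buf, len);
	skb->protocol = eth_type_trans(skb, dev);	/* also sets skb->dev */

	netif_rx(skb);	/* safe from hard interrupt context */

	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;
}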
3188 int netif_rx_ni(struct sk_buff *skb)
3193 err = netif_rx(skb);
3194 if (local_softirq_pending())
3200 EXPORT_SYMBOL(netif_rx_ni);
3202 static void net_tx_action(struct softirq_action *h)
3204 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3206 if (sd->completion_queue) {
3207 struct sk_buff *clist;
3209 local_irq_disable();
3210 clist = sd->completion_queue;
3211 sd->completion_queue = NULL;
3215 struct sk_buff *skb = clist;
3216 clist = clist->next;
3218 WARN_ON(atomic_read(&skb->users));
3219 trace_kfree_skb(skb, net_tx_action);
3224 if (sd->output_queue) {
3227 local_irq_disable();
3228 head = sd->output_queue;
3229 sd->output_queue = NULL;
3230 sd->output_queue_tailp = &sd->output_queue;
3234 struct Qdisc *q = head;
3235 spinlock_t *root_lock;
3237 head = head->next_sched;
3239 root_lock = qdisc_lock(q);
3240 if (spin_trylock(root_lock)) {
3241 smp_mb__before_clear_bit();
3242 clear_bit(__QDISC_STATE_SCHED,
3245 spin_unlock(root_lock);
3247 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3249 __netif_reschedule(q);
3251 smp_mb__before_clear_bit();
3252 clear_bit(__QDISC_STATE_SCHED,
3260 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3261 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3262 /* This hook is defined here for ATM LANE */
3263 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3264 unsigned char *addr) __read_mostly;
3265 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3268 #ifdef CONFIG_NET_CLS_ACT
3269 /* TODO: Maybe we should just force sch_ingress to be compiled in
3270 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for a few useless
3271 * instructions (a compare and two extra stores) when the ingress
3272 * scheduler is not built but CONFIG_NET_CLS_ACT is.
3273 * NOTE: This doesn't stop any functionality; if you don't have
3274 * the ingress scheduler, you just can't add policies on ingress.
3277 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3279 struct net_device *dev = skb->dev;
3280 u32 ttl = G_TC_RTTL(skb->tc_verd);
3281 int result = TC_ACT_OK;
3284 if (unlikely(MAX_RED_LOOP < ttl++)) {
3285 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3286 skb->skb_iif, dev->ifindex);
3290 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3291 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3294 if (q != &noop_qdisc) {
3295 spin_lock(qdisc_lock(q));
3296 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3297 result = qdisc_enqueue_root(skb, q);
3298 spin_unlock(qdisc_lock(q));
3304 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3305 struct packet_type **pt_prev,
3306 int *ret, struct net_device *orig_dev)
3308 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3310 if (!rxq || rxq->qdisc == &noop_qdisc)
3314 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3318 switch (ing_filter(skb, rxq)) {
3332 * netdev_rx_handler_register - register receive handler
3333 * @dev: device to register a handler for
3334 * @rx_handler: receive handler to register
3335 * @rx_handler_data: data pointer that is used by rx handler
3337 * Register a receive handler for a device. This handler will then be
3338 * called from __netif_receive_skb. A negative errno code is returned on a failure.
3341 * The caller must hold the rtnl_mutex.
3343 * For a general description of rx_handler, see enum rx_handler_result.
3345 int netdev_rx_handler_register(struct net_device *dev,
3346 rx_handler_func_t *rx_handler,
3347 void *rx_handler_data)
3351 if (dev->rx_handler)
3354 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3355 rcu_assign_pointer(dev->rx_handler, rx_handler);
3359 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
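/* Editor's example (not part of dev.c): registering a pass-through rx_handler
 * in the style of a bridge/team port. struct foo_port is hypothetical; the
 * handler signature and return values are the real rx_handler contract.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

struct foo_port {
	struct net_device *dev;
};

static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct foo_port *port = rcu_dereference(skb->dev->rx_handler_data);

	/* Inspect or steal the skb here; this sketch just lets it through. */
	(void)port;
	return RX_HANDLER_PASS;
}

static int foo_port_attach(struct net_device *port_dev, struct foo_port *port)
{
	ASSERT_RTNL();	/* netdev_rx_handler_register() requires the RTNL lock */
	return netdev_rx_handler_register(port_dev, foo_handle_frame, port);
}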
3362 * netdev_rx_handler_unregister - unregister receive handler
3363 * @dev: device to unregister a handler from
3365 * Unregister a receive handler from a device.
3367 * The caller must hold the rtnl_mutex.
3369 void netdev_rx_handler_unregister(struct net_device *dev)
3373 RCU_INIT_POINTER(dev->rx_handler, NULL);
3374 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3376 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3379 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3380 * the special handling of PFMEMALLOC skbs.
3382 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3384 switch (skb->protocol) {
3385 case __constant_htons(ETH_P_ARP):
3386 case __constant_htons(ETH_P_IP):
3387 case __constant_htons(ETH_P_IPV6):
3388 case __constant_htons(ETH_P_8021Q):
3395 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3397 struct packet_type *ptype, *pt_prev;
3398 rx_handler_func_t *rx_handler;
3399 struct net_device *orig_dev;
3400 struct net_device *null_or_dev;
3401 bool deliver_exact = false;
3402 int ret = NET_RX_DROP;
3405 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3407 trace_netif_receive_skb(skb);
3409 /* if we've gotten here through NAPI, check netpoll */
3410 if (netpoll_receive_skb(skb))
3413 orig_dev = skb->dev;
3415 skb_reset_network_header(skb);
3416 if (!skb_transport_header_was_set(skb))
3417 skb_reset_transport_header(skb);
3418 skb_reset_mac_len(skb);
3425 skb->skb_iif = skb->dev->ifindex;
3427 __this_cpu_inc(softnet_data.processed);
3429 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3430 skb = vlan_untag(skb);
3435 #ifdef CONFIG_NET_CLS_ACT
3436 if (skb->tc_verd & TC_NCLS) {
3437 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3445 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3446 if (!ptype->dev || ptype->dev == skb->dev) {
3448 ret = deliver_skb(skb, pt_prev, orig_dev);
3454 #ifdef CONFIG_NET_CLS_ACT
3455 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3461 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3464 if (vlan_tx_tag_present(skb)) {
3466 ret = deliver_skb(skb, pt_prev, orig_dev);
3469 if (vlan_do_receive(&skb))
3471 else if (unlikely(!skb))
3475 rx_handler = rcu_dereference(skb->dev->rx_handler);
3478 ret = deliver_skb(skb, pt_prev, orig_dev);
3481 switch (rx_handler(&skb)) {
3482 case RX_HANDLER_CONSUMED:
3484 case RX_HANDLER_ANOTHER:
3486 case RX_HANDLER_EXACT:
3487 deliver_exact = true;
3488 case RX_HANDLER_PASS:
3495 if (vlan_tx_nonzero_tag_present(skb))
3496 skb->pkt_type = PACKET_OTHERHOST;
3498 /* deliver only exact match when indicated */
3499 null_or_dev = deliver_exact ? skb->dev : NULL;
3501 type = skb->protocol;
3502 list_for_each_entry_rcu(ptype,
3503 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3504 if (ptype->type == type &&
3505 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3506 ptype->dev == orig_dev)) {
3508 ret = deliver_skb(skb, pt_prev, orig_dev);
3514 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3517 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3520 atomic_long_inc(&skb->dev->rx_dropped);
3522 /* Jamal, now you will not be able to escape explaining
3523 * to me how you were going to use this. :-)
3534 static int __netif_receive_skb(struct sk_buff *skb)
3538 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3539 unsigned long pflags = current->flags;
3542 * PFMEMALLOC skbs are special, they should
3543 * - be delivered to SOCK_MEMALLOC sockets only
3544 * - stay away from userspace
3545 * - have bounded memory usage
3547 * Use PF_MEMALLOC as this saves us from propagating the allocation
3548 * context down to all allocation sites.
3550 current->flags |= PF_MEMALLOC;
3551 ret = __netif_receive_skb_core(skb, true);
3552 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3554 ret = __netif_receive_skb_core(skb, false);
3560 * netif_receive_skb - process receive buffer from network
3561 * @skb: buffer to process
3563 * netif_receive_skb() is the main receive data processing function.
3564 * It always succeeds. The buffer may be dropped during processing
3565 * for congestion control or by the protocol layers.
3567 * This function may only be called from softirq context and interrupts
3568 * should be enabled.
3570 * Return values (usually ignored):
3571 * NET_RX_SUCCESS: no congestion
3572 * NET_RX_DROP: packet was dropped
3574 int netif_receive_skb(struct sk_buff *skb)
3576 net_timestamp_check(netdev_tstamp_prequeue, skb);
3578 if (skb_defer_rx_timestamp(skb))
3579 return NET_RX_SUCCESS;
3582 if (static_key_false(&rps_needed)) {
3583 struct rps_dev_flow voidflow, *rflow = &voidflow;
3588 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3591 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3598 return __netif_receive_skb(skb);
3600 EXPORT_SYMBOL(netif_receive_skb);
3602 /* Network device is going away, flush any packets still pending
3603 * Called with irqs disabled.
3605 static void flush_backlog(void *arg)
3607 struct net_device *dev = arg;
3608 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3609 struct sk_buff *skb, *tmp;
3612 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3613 if (skb->dev == dev) {
3614 __skb_unlink(skb, &sd->input_pkt_queue);
3616 input_queue_head_incr(sd);
3621 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3622 if (skb->dev == dev) {
3623 __skb_unlink(skb, &sd->process_queue);
3625 input_queue_head_incr(sd);
3630 static int napi_gro_complete(struct sk_buff *skb)
3632 struct packet_offload *ptype;
3633 __be16 type = skb->protocol;
3634 struct list_head *head = &offload_base;
3637 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3639 if (NAPI_GRO_CB(skb)->count == 1) {
3640 skb_shinfo(skb)->gso_size = 0;
3645 list_for_each_entry_rcu(ptype, head, list) {
3646 if (ptype->type != type || !ptype->callbacks.gro_complete)
3649 err = ptype->callbacks.gro_complete(skb);
3655 WARN_ON(&ptype->list == head);
3657 return NET_RX_SUCCESS;
3661 return netif_receive_skb(skb);
3664 /* napi->gro_list contains packets ordered by age, with the
3665 * youngest packets at its head.
3666 * Complete skbs in reverse order to reduce latencies.
3668 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3670 struct sk_buff *skb, *prev = NULL;
3672 /* scan list and build reverse chain */
3673 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3678 for (skb = prev; skb; skb = prev) {
3681 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3685 napi_gro_complete(skb);
3689 napi->gro_list = NULL;
3691 EXPORT_SYMBOL(napi_gro_flush);
3693 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3696 unsigned int maclen = skb->dev->hard_header_len;
3698 for (p = napi->gro_list; p; p = p->next) {
3699 unsigned long diffs;
3701 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3702 diffs |= p->vlan_tci ^ skb->vlan_tci;
3703 if (maclen == ETH_HLEN)
3704 diffs |= compare_ether_header(skb_mac_header(p),
3705 skb_gro_mac_header(skb));
3707 diffs = memcmp(skb_mac_header(p),
3708 skb_gro_mac_header(skb),
3710 NAPI_GRO_CB(p)->same_flow = !diffs;
3711 NAPI_GRO_CB(p)->flush = 0;
3715 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3717 struct sk_buff **pp = NULL;
3718 struct packet_offload *ptype;
3719 __be16 type = skb->protocol;
3720 struct list_head *head = &offload_base;
3722 enum gro_result ret;
3724 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3727 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3730 gro_list_prepare(napi, skb);
3733 list_for_each_entry_rcu(ptype, head, list) {
3734 if (ptype->type != type || !ptype->callbacks.gro_receive)
3737 skb_set_network_header(skb, skb_gro_offset(skb));
3738 skb_reset_mac_len(skb);
3739 NAPI_GRO_CB(skb)->same_flow = 0;
3740 NAPI_GRO_CB(skb)->flush = 0;
3741 NAPI_GRO_CB(skb)->free = 0;
3743 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3748 if (&ptype->list == head)
3751 same_flow = NAPI_GRO_CB(skb)->same_flow;
3752 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3755 struct sk_buff *nskb = *pp;
3759 napi_gro_complete(nskb);
3766 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3770 NAPI_GRO_CB(skb)->count = 1;
3771 NAPI_GRO_CB(skb)->age = jiffies;
3772 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3773 skb->next = napi->gro_list;
3774 napi->gro_list = skb;
3778 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3779 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3781 BUG_ON(skb->end - skb->tail < grow);
3783 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3786 skb->data_len -= grow;
3788 skb_shinfo(skb)->frags[0].page_offset += grow;
3789 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3791 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3792 skb_frag_unref(skb, 0);
3793 memmove(skb_shinfo(skb)->frags,
3794 skb_shinfo(skb)->frags + 1,
3795 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3808 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3812 if (netif_receive_skb(skb))
3820 case GRO_MERGED_FREE:
3821 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3822 kmem_cache_free(skbuff_head_cache, skb);
3835 static void skb_gro_reset_offset(struct sk_buff *skb)
3837 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3838 const skb_frag_t *frag0 = &pinfo->frags[0];
3840 NAPI_GRO_CB(skb)->data_offset = 0;
3841 NAPI_GRO_CB(skb)->frag0 = NULL;
3842 NAPI_GRO_CB(skb)->frag0_len = 0;
3844 if (skb->mac_header == skb->tail &&
3846 !PageHighMem(skb_frag_page(frag0))) {
3847 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3848 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3852 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3854 skb_gro_reset_offset(skb);
3856 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3858 EXPORT_SYMBOL(napi_gro_receive);
3860 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3862 __skb_pull(skb, skb_headlen(skb));
3863 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3864 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3866 skb->dev = napi->dev;
3872 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3874 struct sk_buff *skb = napi->skb;
3877 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3883 EXPORT_SYMBOL(napi_get_frags);
3885 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3891 skb->protocol = eth_type_trans(skb, skb->dev);
3893 if (ret == GRO_HELD)
3894 skb_gro_pull(skb, -ETH_HLEN);
3895 else if (netif_receive_skb(skb))
3900 case GRO_MERGED_FREE:
3901 napi_reuse_skb(napi, skb);
3911 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3913 struct sk_buff *skb = napi->skb;
3920 skb_reset_mac_header(skb);
3921 skb_gro_reset_offset(skb);
3923 off = skb_gro_offset(skb);
3924 hlen = off + sizeof(*eth);
3925 eth = skb_gro_header_fast(skb, off);
3926 if (skb_gro_header_hard(skb, hlen)) {
3927 eth = skb_gro_header_slow(skb, hlen, off);
3928 if (unlikely(!eth)) {
3929 napi_reuse_skb(napi, skb);
3935 skb_gro_pull(skb, sizeof(*eth));
3938 * This works because the only protocols we care about don't require
3939 * special handling. We'll fix it up properly at the end.
3941 skb->protocol = eth->h_proto;
3947 gro_result_t napi_gro_frags(struct napi_struct *napi)
3949 struct sk_buff *skb = napi_frags_skb(napi);
3954 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
3956 EXPORT_SYMBOL(napi_gro_frags);
3959 * net_rps_action sends any pending IPIs for RPS.
3960 * Note: called with local irq disabled, but exits with local irq enabled.
3962 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3965 struct softnet_data *remsd = sd->rps_ipi_list;
3968 sd->rps_ipi_list = NULL;
3972 /* Send pending IPI's to kick RPS processing on remote cpus. */
3974 struct softnet_data *next = remsd->rps_ipi_next;
3976 if (cpu_online(remsd->cpu))
3977 __smp_call_function_single(remsd->cpu,
3986 static int process_backlog(struct napi_struct *napi, int quota)
3989 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3992 /* Check if we have pending IPIs; it's better to send them now
3993 * than to wait for the end of net_rx_action().
3995 if (sd->rps_ipi_list) {
3996 local_irq_disable();
3997 net_rps_action_and_irq_enable(sd);
4000 napi->weight = weight_p;
4001 local_irq_disable();
4002 while (work < quota) {
4003 struct sk_buff *skb;
4006 while ((skb = __skb_dequeue(&sd->process_queue))) {
4008 __netif_receive_skb(skb);
4009 local_irq_disable();
4010 input_queue_head_incr(sd);
4011 if (++work >= quota) {
4018 qlen = skb_queue_len(&sd->input_pkt_queue);
4020 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4021 &sd->process_queue);
4023 if (qlen < quota - work) {
4025 * Inline a custom version of __napi_complete().
4026 * Only the current cpu owns and manipulates this napi,
4027 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
4028 * so we can use a plain write instead of clear_bit(),
4029 * and we don't need an smp_mb() memory barrier.
4031 list_del(&napi->poll_list);
4034 quota = work + qlen;
4044 * __napi_schedule - schedule for receive
4045 * @n: entry to schedule
4047 * The entry's receive function will be scheduled to run
4049 void __napi_schedule(struct napi_struct *n)
4051 unsigned long flags;
4053 local_irq_save(flags);
4054 ____napi_schedule(&__get_cpu_var(softnet_data), n);
4055 local_irq_restore(flags);
4057 EXPORT_SYMBOL(__napi_schedule);
4059 void __napi_complete(struct napi_struct *n)
4061 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4062 BUG_ON(n->gro_list);
4064 list_del(&n->poll_list);
4065 smp_mb__before_clear_bit();
4066 clear_bit(NAPI_STATE_SCHED, &n->state);
4068 EXPORT_SYMBOL(__napi_complete);
4070 void napi_complete(struct napi_struct *n)
4072 unsigned long flags;
4075 * don't let napi dequeue from the cpu poll list
4076 * just in case it's running on a different cpu.
4078 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4081 napi_gro_flush(n, false);
4082 local_irq_save(flags);
4084 local_irq_restore(flags);
4086 EXPORT_SYMBOL(napi_complete);
4088 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4089 int (*poll)(struct napi_struct *, int), int weight)
4091 INIT_LIST_HEAD(&napi->poll_list);
4092 napi->gro_count = 0;
4093 napi->gro_list = NULL;
4096 napi->weight = weight;
4097 list_add(&napi->dev_list, &dev->napi_list);
4099 #ifdef CONFIG_NETPOLL
4100 spin_lock_init(&napi->poll_lock);
4101 napi->poll_owner = -1;
4103 set_bit(NAPI_STATE_SCHED, &napi->state);
4105 EXPORT_SYMBOL(netif_napi_add);
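/* Editor's example (not part of dev.c): the usual NAPI pattern built on
 * netif_napi_add()/napi_complete(). struct foo_priv and the foo_* helpers are
 * hypothetical stubs standing in for real ring-cleaning and IRQ code.
 */
#include <linux/netdevice.h>

struct foo_priv {
	struct napi_struct napi;
	/* ... hardware ring state ... */
};

static int foo_clean_rx(struct foo_priv *priv, int budget)
{
	return 0;	/* stub: a real driver would feed frames to napi_gro_receive() */
}

static void foo_enable_rx_irq(struct foo_priv *priv)
{
	/* stub: re-enable the device's RX interrupt here */
}

static int foo_poll(struct napi_struct *napi, int budget)
{
	struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
	int work_done = foo_clean_rx(priv, budget);

	/* Only complete NAPI (and re-arm the interrupt) when under budget;
	 * otherwise net_rx_action() keeps us on the poll list.
	 */
	if (work_done < budget) {
		napi_complete(napi);
		foo_enable_rx_irq(priv);
	}
	return work_done;
}

static void foo_setup_napi(struct net_device *dev, struct foo_priv *priv)
{
	netif_napi_add(dev, &priv->napi, foo_poll, 64);	/* conventional weight */
	napi_enable(&priv->napi);
}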
4107 void netif_napi_del(struct napi_struct *napi)
4109 struct sk_buff *skb, *next;
4111 list_del_init(&napi->dev_list);
4112 napi_free_frags(napi);
4114 for (skb = napi->gro_list; skb; skb = next) {
4120 napi->gro_list = NULL;
4121 napi->gro_count = 0;
4123 EXPORT_SYMBOL(netif_napi_del);
4125 static void net_rx_action(struct softirq_action *h)
4127 struct softnet_data *sd = &__get_cpu_var(softnet_data);
4128 unsigned long time_limit = jiffies + 2;
4129 int budget = netdev_budget;
4132 local_irq_disable();
4134 while (!list_empty(&sd->poll_list)) {
4135 struct napi_struct *n;
4138 /* If the softirq window is exhausted then punt.
4139 * Allow this to run for 2 jiffies, which allows
4140 * an average latency of 1.5/HZ.
4142 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
4147 /* Even though interrupts have been re-enabled, this
4148 * access is safe because interrupts can only add new
4149 * entries to the tail of this list, and only ->poll()
4150 * calls can remove this head entry from the list.
4152 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4154 have = netpoll_poll_lock(n);
4158 /* This NAPI_STATE_SCHED test is for avoiding a race
4159 * with netpoll's poll_napi(). Only the entity which
4160 * obtains the lock and sees NAPI_STATE_SCHED set will
4161 * actually make the ->poll() call. Therefore we avoid
4162 * accidentally calling ->poll() when NAPI is not scheduled.
4165 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4166 work = n->poll(n, weight);
4170 WARN_ON_ONCE(work > weight);
4174 local_irq_disable();
4176 /* Drivers must not modify the NAPI state if they
4177 * consume the entire weight. In such cases this code
4178 * still "owns" the NAPI instance and therefore can
4179 * move the instance around on the list at-will.
4181 if (unlikely(work == weight)) {
4182 if (unlikely(napi_disable_pending(n))) {
4185 local_irq_disable();
4188 /* Flush packets that are too old.
4189 * If HZ < 1000, flush all packets.
4192 napi_gro_flush(n, HZ >= 1000);
4193 local_irq_disable();
4195 list_move_tail(&n->poll_list, &sd->poll_list);
4199 netpoll_poll_unlock(have);
4202 net_rps_action_and_irq_enable(sd);
4204 #ifdef CONFIG_NET_DMA
4206 * There may not be any more sk_buffs coming right now, so push
4207 * any pending DMA copies to hardware
4209 dma_issue_pending_all();
4216 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4220 #ifdef CONFIG_PROC_FS
4222 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4224 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4225 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4226 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4228 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4230 struct net *net = seq_file_net(seq);
4231 struct net_device *dev;
4232 struct hlist_node *p;
4233 struct hlist_head *h;
4234 unsigned int count = 0, offset = get_offset(*pos);
4236 h = &net->dev_name_head[get_bucket(*pos)];
4237 hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4238 if (++count == offset)
4245 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4247 struct net_device *dev;
4248 unsigned int bucket;
4251 dev = dev_from_same_bucket(seq, pos);
4255 bucket = get_bucket(*pos) + 1;
4256 *pos = set_bucket_offset(bucket, 1);
4257 } while (bucket < NETDEV_HASHENTRIES);
4263 * This is invoked by the /proc filesystem handler to display a device in detail.
4266 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4271 return SEQ_START_TOKEN;
4273 if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4276 return dev_from_bucket(seq, pos);
4279 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4282 return dev_from_bucket(seq, pos);
4285 void dev_seq_stop(struct seq_file *seq, void *v)
4291 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4293 struct rtnl_link_stats64 temp;
4294 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4296 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4297 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4298 dev->name, stats->rx_bytes, stats->rx_packets,
4300 stats->rx_dropped + stats->rx_missed_errors,
4301 stats->rx_fifo_errors,
4302 stats->rx_length_errors + stats->rx_over_errors +
4303 stats->rx_crc_errors + stats->rx_frame_errors,
4304 stats->rx_compressed, stats->multicast,
4305 stats->tx_bytes, stats->tx_packets,
4306 stats->tx_errors, stats->tx_dropped,
4307 stats->tx_fifo_errors, stats->collisions,
4308 stats->tx_carrier_errors +
4309 stats->tx_aborted_errors +
4310 stats->tx_window_errors +
4311 stats->tx_heartbeat_errors,
4312 stats->tx_compressed);
4316 * Called from the PROCfs module. This now uses the new arbitrary-sized
4317 * /proc/net interface to create /proc/net/dev
4319 static int dev_seq_show(struct seq_file *seq, void *v)
4321 if (v == SEQ_START_TOKEN)
4322 seq_puts(seq, "Inter-| Receive "
4324 " face |bytes packets errs drop fifo frame "
4325 "compressed multicast|bytes packets errs "
4326 "drop fifo colls carrier compressed\n");
4328 dev_seq_printf_stats(seq, v);
4332 static struct softnet_data *softnet_get_online(loff_t *pos)
4334 struct softnet_data *sd = NULL;
4336 while (*pos < nr_cpu_ids)
4337 if (cpu_online(*pos)) {
4338 sd = &per_cpu(softnet_data, *pos);
4345 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4347 return softnet_get_online(pos);
4350 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4353 return softnet_get_online(pos);
4356 static void softnet_seq_stop(struct seq_file *seq, void *v)
4360 static int softnet_seq_show(struct seq_file *seq, void *v)
4362 struct softnet_data *sd = v;
4364 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4365 sd->processed, sd->dropped, sd->time_squeeze, 0,
4366 0, 0, 0, 0, /* was fastroute */
4367 sd->cpu_collision, sd->received_rps);
4371 static const struct seq_operations dev_seq_ops = {
4372 .start = dev_seq_start,
4373 .next = dev_seq_next,
4374 .stop = dev_seq_stop,
4375 .show = dev_seq_show,
4378 static int dev_seq_open(struct inode *inode, struct file *file)
4380 return seq_open_net(inode, file, &dev_seq_ops,
4381 sizeof(struct seq_net_private));
4384 static const struct file_operations dev_seq_fops = {
4385 .owner = THIS_MODULE,
4386 .open = dev_seq_open,
4388 .llseek = seq_lseek,
4389 .release = seq_release_net,
4392 static const struct seq_operations softnet_seq_ops = {
4393 .start = softnet_seq_start,
4394 .next = softnet_seq_next,
4395 .stop = softnet_seq_stop,
4396 .show = softnet_seq_show,
4399 static int softnet_seq_open(struct inode *inode, struct file *file)
4401 return seq_open(file, &softnet_seq_ops);
4404 static const struct file_operations softnet_seq_fops = {
4405 .owner = THIS_MODULE,
4406 .open = softnet_seq_open,
4408 .llseek = seq_lseek,
4409 .release = seq_release,
4412 static void *ptype_get_idx(loff_t pos)
4414 struct packet_type *pt = NULL;
4418 list_for_each_entry_rcu(pt, &ptype_all, list) {
4424 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4425 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4434 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4438 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4441 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4443 struct packet_type *pt;
4444 struct list_head *nxt;
4448 if (v == SEQ_START_TOKEN)
4449 return ptype_get_idx(0);
4452 nxt = pt->list.next;
4453 if (pt->type == htons(ETH_P_ALL)) {
4454 if (nxt != &ptype_all)
4457 nxt = ptype_base[0].next;
4459 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4461 while (nxt == &ptype_base[hash]) {
4462 if (++hash >= PTYPE_HASH_SIZE)
4464 nxt = ptype_base[hash].next;
4467 return list_entry(nxt, struct packet_type, list);
4470 static void ptype_seq_stop(struct seq_file *seq, void *v)
4476 static int ptype_seq_show(struct seq_file *seq, void *v)
4478 struct packet_type *pt = v;
4480 if (v == SEQ_START_TOKEN)
4481 seq_puts(seq, "Type Device Function\n");
4482 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4483 if (pt->type == htons(ETH_P_ALL))
4484 seq_puts(seq, "ALL ");
4486 seq_printf(seq, "%04x", ntohs(pt->type));
4488 seq_printf(seq, " %-8s %pF\n",
4489 pt->dev ? pt->dev->name : "", pt->func);
4495 static const struct seq_operations ptype_seq_ops = {
4496 .start = ptype_seq_start,
4497 .next = ptype_seq_next,
4498 .stop = ptype_seq_stop,
4499 .show = ptype_seq_show,
4502 static int ptype_seq_open(struct inode *inode, struct file *file)
4504 return seq_open_net(inode, file, &ptype_seq_ops,
4505 sizeof(struct seq_net_private));
4508 static const struct file_operations ptype_seq_fops = {
4509 .owner = THIS_MODULE,
4510 .open = ptype_seq_open,
4512 .llseek = seq_lseek,
4513 .release = seq_release_net,
4517 static int __net_init dev_proc_net_init(struct net *net)
4521 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4523 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4525 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4528 if (wext_proc_init(net))
4534 proc_net_remove(net, "ptype");
4536 proc_net_remove(net, "softnet_stat");
4538 proc_net_remove(net, "dev");
4542 static void __net_exit dev_proc_net_exit(struct net *net)
4544 wext_proc_exit(net);
4546 proc_net_remove(net, "ptype");
4547 proc_net_remove(net, "softnet_stat");
4548 proc_net_remove(net, "dev");
4551 static struct pernet_operations __net_initdata dev_proc_ops = {
4552 .init = dev_proc_net_init,
4553 .exit = dev_proc_net_exit,
4556 static int __init dev_proc_init(void)
4558 return register_pernet_subsys(&dev_proc_ops);
4561 #define dev_proc_init() 0
4562 #endif /* CONFIG_PROC_FS */
4565 struct netdev_upper {
4566 struct net_device *dev;
4568 struct list_head list;
4569 struct rcu_head rcu;
4570 struct list_head search_list;
4573 static void __append_search_uppers(struct list_head *search_list,
4574 struct net_device *dev)
4576 struct netdev_upper *upper;
4578 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4579 /* check that this upper is not already in the search list */
4580 if (list_empty(&upper->search_list))
4581 list_add_tail(&upper->search_list, search_list);
4585 static bool __netdev_search_upper_dev(struct net_device *dev,
4586 struct net_device *upper_dev)
4588 LIST_HEAD(search_list);
4589 struct netdev_upper *upper;
4590 struct netdev_upper *tmp;
4593 __append_search_uppers(&search_list, dev);
4594 list_for_each_entry(upper, &search_list, search_list) {
4595 if (upper->dev == upper_dev) {
4599 __append_search_uppers(&search_list, upper->dev);
4601 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4602 INIT_LIST_HEAD(&upper->search_list);
4606 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4607 struct net_device *upper_dev)
4609 struct netdev_upper *upper;
4611 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4612 if (upper->dev == upper_dev)
4619 * netdev_has_upper_dev - Check if device is linked to an upper device
4621 * @upper_dev: upper device to check
4623 * Find out if a device is linked to the specified upper device and return true
4624 * if it is. Note that this checks only the immediate upper device,
4625 * not the complete stack of devices. The caller must hold the RTNL lock.
4627 bool netdev_has_upper_dev(struct net_device *dev,
4628 struct net_device *upper_dev)
4632 return __netdev_find_upper(dev, upper_dev);
4634 EXPORT_SYMBOL(netdev_has_upper_dev);
4637 * netdev_has_any_upper_dev - Check if device is linked to some device
4640 * Find out if a device is linked to an upper device and return true in case
4641 * it is. The caller must hold the RTNL lock.
4643 bool netdev_has_any_upper_dev(struct net_device *dev)
4647 return !list_empty(&dev->upper_dev_list);
4649 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4652 * netdev_master_upper_dev_get - Get master upper device
4655 * Find a master upper device and return a pointer to it, or NULL if
4656 * there is none. The caller must hold the RTNL lock.
4658 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4660 struct netdev_upper *upper;
4664 if (list_empty(&dev->upper_dev_list))
4667 upper = list_first_entry(&dev->upper_dev_list,
4668 struct netdev_upper, list);
4669 if (likely(upper->master))
4673 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4676 * netdev_master_upper_dev_get_rcu - Get master upper device
4679 * Find a master upper device and return a pointer to it, or NULL if
4680 * there is none. The caller must hold the RCU read lock.
4682 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4684 struct netdev_upper *upper;
4686 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4687 struct netdev_upper, list);
4688 if (upper && likely(upper->master))
4692 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4694 static int __netdev_upper_dev_link(struct net_device *dev,
4695 struct net_device *upper_dev, bool master)
4697 struct netdev_upper *upper;
4701 if (dev == upper_dev)
4704 /* To prevent loops, check that dev is not an upper device of upper_dev. */
4705 if (__netdev_search_upper_dev(upper_dev, dev))
4708 if (__netdev_find_upper(dev, upper_dev))
4711 if (master && netdev_master_upper_dev_get(dev))
4714 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4718 upper->dev = upper_dev;
4719 upper->master = master;
4720 INIT_LIST_HEAD(&upper->search_list);
4722 /* Ensure that master upper link is always the first item in list. */
4724 list_add_rcu(&upper->list, &dev->upper_dev_list);
4726 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4727 dev_hold(upper_dev);
4733 * netdev_upper_dev_link - Add a link to the upper device
4735 * @upper_dev: new upper device
4737 * Adds a link to device which is upper to this one. The caller must hold
4738 * the RTNL lock. On a failure a negative errno code is returned.
4739 * On success the reference counts are adjusted and the function returns zero.
4742 int netdev_upper_dev_link(struct net_device *dev,
4743 struct net_device *upper_dev)
4745 return __netdev_upper_dev_link(dev, upper_dev, false);
4747 EXPORT_SYMBOL(netdev_upper_dev_link);
4750 * netdev_master_upper_dev_link - Add a master link to the upper device
4752 * @upper_dev: new upper device
4754 * Adds a link to device which is upper to this one. In this case, only
4755 * one master upper device can be linked, although other non-master devices
4756 * might be linked as well. The caller must hold the RTNL lock.
4757 * On a failure a negative errno code is returned. On success the reference
4758 * counts are adjusted and the function returns zero.
4760 int netdev_master_upper_dev_link(struct net_device *dev,
4761 struct net_device *upper_dev)
4763 return __netdev_upper_dev_link(dev, upper_dev, true);
4765 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4768 * netdev_upper_dev_unlink - Removes a link to upper device
4770 * @upper_dev: upper device to unlink
4772 * Removes a link to a device which is upper to this one. The caller must hold the RTNL lock.
4775 void netdev_upper_dev_unlink(struct net_device *dev,
4776 struct net_device *upper_dev)
4778 struct netdev_upper *upper;
4782 upper = __netdev_find_upper(dev, upper_dev);
4785 list_del_rcu(&upper->list);
4787 kfree_rcu(upper, rcu);
4789 EXPORT_SYMBOL(netdev_upper_dev_unlink);
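/* Editor's example (not part of dev.c): how a bonding/team-like master could
 * use the upper-device API to attach and detach a slave. The foo_* names are
 * hypothetical; hardware programming and notification steps are omitted.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int foo_enslave(struct net_device *master, struct net_device *slave)
{
	int err;

	ASSERT_RTNL();
	err = netdev_master_upper_dev_link(slave, master);
	if (err)
		return err;

	/* ... start using the slave for TX/RX here ... */
	return 0;
}

static void foo_release_slave(struct net_device *master, struct net_device *slave)
{
	ASSERT_RTNL();
	netdev_upper_dev_unlink(slave, master);
}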
4791 static void dev_change_rx_flags(struct net_device *dev, int flags)
4793 const struct net_device_ops *ops = dev->netdev_ops;
4795 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4796 ops->ndo_change_rx_flags(dev, flags);
4799 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4801 unsigned int old_flags = dev->flags;
4807 dev->flags |= IFF_PROMISC;
4808 dev->promiscuity += inc;
4809 if (dev->promiscuity == 0) {
4812 * If inc causes an overflow, leave promiscuity untouched and return an error.
4815 dev->flags &= ~IFF_PROMISC;
4817 dev->promiscuity -= inc;
4818 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4823 if (dev->flags != old_flags) {
4824 pr_info("device %s %s promiscuous mode\n",
4826 dev->flags & IFF_PROMISC ? "entered" : "left");
4827 if (audit_enabled) {
4828 current_uid_gid(&uid, &gid);
4829 audit_log(current->audit_context, GFP_ATOMIC,
4830 AUDIT_ANOM_PROMISCUOUS,
4831 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4832 dev->name, (dev->flags & IFF_PROMISC),
4833 (old_flags & IFF_PROMISC),
4834 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4835 from_kuid(&init_user_ns, uid),
4836 from_kgid(&init_user_ns, gid),
4837 audit_get_sessionid(current));
4840 dev_change_rx_flags(dev, IFF_PROMISC);
4846 * dev_set_promiscuity - update promiscuity count on a device
4850 * Add or remove promiscuity from a device. While the count in the device
4851 * remains above zero the interface remains promiscuous. Once it hits zero
4852 * the device reverts to normal filtering operation. A negative inc
4853 * value is used to drop promiscuity on the device.
4854 * Return 0 if successful or a negative errno code on error.
4856 int dev_set_promiscuity(struct net_device *dev, int inc)
4858 unsigned int old_flags = dev->flags;
4861 err = __dev_set_promiscuity(dev, inc);
4864 if (dev->flags != old_flags)
4865 dev_set_rx_mode(dev);
4868 EXPORT_SYMBOL(dev_set_promiscuity);
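/* Editor's example (not part of dev.c): a capture-style user of the
 * promiscuity counter. Every +1 must eventually be balanced by a -1; the
 * RTNL lock is taken here as the ioctl and packet-socket paths do.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int foo_capture_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);
	rtnl_unlock();
	return err;
}

static void foo_capture_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);
	rtnl_unlock();
}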
4871 * dev_set_allmulti - update allmulti count on a device
4875 * Add or remove reception of all multicast frames on a device. While the
4876 * count in the device remains above zero the interface keeps listening
4877 * to all multicast frames. Once it hits zero the device reverts to normal
4878 * filtering operation. A negative @inc value is used to drop the counter
4879 * when releasing a resource that needed all multicasts.
4880 * Return 0 if successful or a negative errno code on error.
4883 int dev_set_allmulti(struct net_device *dev, int inc)
4885 unsigned int old_flags = dev->flags;
4889 dev->flags |= IFF_ALLMULTI;
4890 dev->allmulti += inc;
4891 if (dev->allmulti == 0) {
4894 * If inc causes an overflow, leave allmulti untouched and return an error.
4897 dev->flags &= ~IFF_ALLMULTI;
4899 dev->allmulti -= inc;
4900 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4905 if (dev->flags ^ old_flags) {
4906 dev_change_rx_flags(dev, IFF_ALLMULTI);
4907 dev_set_rx_mode(dev);
4911 EXPORT_SYMBOL(dev_set_allmulti);
4914 * Upload unicast and multicast address lists to device and
4915 * configure RX filtering. When the device doesn't support unicast
4916 * filtering it is put in promiscuous mode while unicast addresses are present.
4919 void __dev_set_rx_mode(struct net_device *dev)
4921 const struct net_device_ops *ops = dev->netdev_ops;
4923 /* dev_open will call this function so the list will stay sane. */
4924 if (!(dev->flags&IFF_UP))
4927 if (!netif_device_present(dev))
4930 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4931 /* Unicast address changes may only happen under the rtnl,
4932 * therefore calling __dev_set_promiscuity here is safe.
4934 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4935 __dev_set_promiscuity(dev, 1);
4936 dev->uc_promisc = true;
4937 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4938 __dev_set_promiscuity(dev, -1);
4939 dev->uc_promisc = false;
4943 if (ops->ndo_set_rx_mode)
4944 ops->ndo_set_rx_mode(dev);
4947 void dev_set_rx_mode(struct net_device *dev)
4949 netif_addr_lock_bh(dev);
4950 __dev_set_rx_mode(dev);
4951 netif_addr_unlock_bh(dev);
4955 * dev_get_flags - get flags reported to userspace
4958 * Get the combination of flag bits exported through APIs to userspace.
4960 unsigned int dev_get_flags(const struct net_device *dev)
4964 flags = (dev->flags & ~(IFF_PROMISC |
4969 (dev->gflags & (IFF_PROMISC |
4972 if (netif_running(dev)) {
4973 if (netif_oper_up(dev))
4974 flags |= IFF_RUNNING;
4975 if (netif_carrier_ok(dev))
4976 flags |= IFF_LOWER_UP;
4977 if (netif_dormant(dev))
4978 flags |= IFF_DORMANT;
4983 EXPORT_SYMBOL(dev_get_flags);
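/*
 * Example usage (illustrative sketch only): the userspace view of the
 * flags mixes dev->flags, dev->gflags and the operstate-derived bits, so
 * readers should go through dev_get_flags() instead of looking at
 * dev->flags directly. example_log_link_state() is a made-up helper.
 */
static void example_log_link_state(const struct net_device *dev)
{
	unsigned int flags = dev_get_flags(dev);

	netdev_info(dev, "up=%d running=%d lower_up=%d dormant=%d\n",
		    !!(flags & IFF_UP), !!(flags & IFF_RUNNING),
		    !!(flags & IFF_LOWER_UP), !!(flags & IFF_DORMANT));
}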
4985 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4987 unsigned int old_flags = dev->flags;
4993 * Set the flags on our device.
4996 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4997 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4999 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5003 * Load in the correct multicast list now the flags have changed.
5006 if ((old_flags ^ flags) & IFF_MULTICAST)
5007 dev_change_rx_flags(dev, IFF_MULTICAST);
5009 dev_set_rx_mode(dev);
5012 	 *	Have we downed the interface? We handle IFF_UP ourselves
5013 	 *	according to user attempts to set it, rather than blindly setting it.
5018 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
5019 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5022 dev_set_rx_mode(dev);
5025 if ((flags ^ dev->gflags) & IFF_PROMISC) {
5026 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5028 dev->gflags ^= IFF_PROMISC;
5029 dev_set_promiscuity(dev, inc);
5032 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5033 	   is important. Some (broken) drivers set IFF_PROMISC when
5034 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
5036 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5037 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5039 dev->gflags ^= IFF_ALLMULTI;
5040 dev_set_allmulti(dev, inc);
5046 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
5048 unsigned int changes = dev->flags ^ old_flags;
5050 if (changes & IFF_UP) {
5051 if (dev->flags & IFF_UP)
5052 call_netdevice_notifiers(NETDEV_UP, dev);
5054 call_netdevice_notifiers(NETDEV_DOWN, dev);
5057 if (dev->flags & IFF_UP &&
5058 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
5059 call_netdevice_notifiers(NETDEV_CHANGE, dev);
5063 * dev_change_flags - change device settings
5065 * @flags: device state flags
5067  *	Change settings on device based on state flags. The flags are
5068 * in the userspace exported format.
5070 int dev_change_flags(struct net_device *dev, unsigned int flags)
5073 unsigned int changes, old_flags = dev->flags;
5075 ret = __dev_change_flags(dev, flags);
5079 changes = old_flags ^ dev->flags;
5081 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
5083 __dev_notify_flags(dev, old_flags);
5086 EXPORT_SYMBOL(dev_change_flags);
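/*
 * Example usage (illustrative sketch only): administratively toggling an
 * interface is just flipping IFF_UP in the userspace-format flags. The
 * RTNL locking and the example_set_admin_state() name are assumptions of
 * this sketch.
 */
static int example_set_admin_state(struct net_device *dev, bool up)
{
	unsigned int flags;
	int err;

	rtnl_lock();
	flags = dev_get_flags(dev);
	err = dev_change_flags(dev, up ? flags | IFF_UP : flags & ~IFF_UP);
	rtnl_unlock();
	return err;
}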
5089 * dev_set_mtu - Change maximum transfer unit
5091 * @new_mtu: new transfer unit
5093 * Change the maximum transfer size of the network device.
5095 int dev_set_mtu(struct net_device *dev, int new_mtu)
5097 const struct net_device_ops *ops = dev->netdev_ops;
5100 if (new_mtu == dev->mtu)
5103 /* MTU must be positive. */
5107 if (!netif_device_present(dev))
5111 if (ops->ndo_change_mtu)
5112 err = ops->ndo_change_mtu(dev, new_mtu);
5117 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5120 EXPORT_SYMBOL(dev_set_mtu);
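/*
 * Example usage (illustrative sketch only): a tunnel-style driver might
 * track a lower device's MTU minus its own (hypothetical) header
 * overhead. EXAMPLE_HDR_LEN and example_sync_mtu() are made up.
 */
#define EXAMPLE_HDR_LEN	8

static int example_sync_mtu(struct net_device *dev,
			    const struct net_device *lower)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, lower->mtu - EXAMPLE_HDR_LEN);
	rtnl_unlock();
	return err;
}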
5123 * dev_set_group - Change group this device belongs to
5125 * @new_group: group this device should belong to
5127 void dev_set_group(struct net_device *dev, int new_group)
5129 dev->group = new_group;
5131 EXPORT_SYMBOL(dev_set_group);
5134 * dev_set_mac_address - Change Media Access Control Address
5138 * Change the hardware (MAC) address of the device
5140 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5142 const struct net_device_ops *ops = dev->netdev_ops;
5145 if (!ops->ndo_set_mac_address)
5147 if (sa->sa_family != dev->type)
5149 if (!netif_device_present(dev))
5151 err = ops->ndo_set_mac_address(dev, sa);
5154 dev->addr_assign_type = NET_ADDR_SET;
5155 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5156 add_device_randomness(dev->dev_addr, dev->addr_len);
5159 EXPORT_SYMBOL(dev_set_mac_address);
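/*
 * Example usage (illustrative sketch only): the new address is passed as
 * a struct sockaddr whose sa_family must match dev->type and whose
 * sa_data carries dev->addr_len bytes. example_set_mac() is invented.
 */
static int example_set_mac(struct net_device *dev, const u8 *addr)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}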
5162 * dev_change_carrier - Change device carrier
5164  *	@new_carrier: new value
5166 * Change device carrier
5168 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5170 const struct net_device_ops *ops = dev->netdev_ops;
5172 if (!ops->ndo_change_carrier)
5174 if (!netif_device_present(dev))
5176 return ops->ndo_change_carrier(dev, new_carrier);
5178 EXPORT_SYMBOL(dev_change_carrier);
5181 * dev_new_index - allocate an ifindex
5182 * @net: the applicable net namespace
5184 * Returns a suitable unique value for a new device interface
5185 * number. The caller must hold the rtnl semaphore or the
5186 * dev_base_lock to be sure it remains unique.
5188 static int dev_new_index(struct net *net)
5190 int ifindex = net->ifindex;
5194 if (!__dev_get_by_index(net, ifindex))
5195 return net->ifindex = ifindex;
5199 /* Delayed registration/unregistration */
5200 static LIST_HEAD(net_todo_list);
5202 static void net_set_todo(struct net_device *dev)
5204 list_add_tail(&dev->todo_list, &net_todo_list);
5207 static void rollback_registered_many(struct list_head *head)
5209 struct net_device *dev, *tmp;
5211 BUG_ON(dev_boot_phase);
5214 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5215 /* Some devices call without registering
5216 * for initialization unwind. Remove those
5217 * devices and proceed with the remaining.
5219 if (dev->reg_state == NETREG_UNINITIALIZED) {
5220 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5224 list_del(&dev->unreg_list);
5227 dev->dismantle = true;
5228 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5231 /* If device is running, close it first. */
5232 dev_close_many(head);
5234 list_for_each_entry(dev, head, unreg_list) {
5235 /* And unlink it from device chain. */
5236 unlist_netdevice(dev);
5238 dev->reg_state = NETREG_UNREGISTERING;
5243 list_for_each_entry(dev, head, unreg_list) {
5244 /* Shutdown queueing discipline. */
5248 		/* Notify protocols that we are about to destroy
5249 this device. They should clean all the things.
5251 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5253 if (!dev->rtnl_link_ops ||
5254 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5255 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5258 * Flush the unicast and multicast chains
5263 if (dev->netdev_ops->ndo_uninit)
5264 dev->netdev_ops->ndo_uninit(dev);
5266 		/* Notifier chain MUST detach us from all upper devices. */
5267 WARN_ON(netdev_has_any_upper_dev(dev));
5269 /* Remove entries from kobject tree */
5270 netdev_unregister_kobject(dev);
5272 /* Remove XPS queueing entries */
5273 netif_reset_xps_queues_gt(dev, 0);
5279 list_for_each_entry(dev, head, unreg_list)
5283 static void rollback_registered(struct net_device *dev)
5287 list_add(&dev->unreg_list, &single);
5288 rollback_registered_many(&single);
5292 static netdev_features_t netdev_fix_features(struct net_device *dev,
5293 netdev_features_t features)
5295 /* Fix illegal checksum combinations */
5296 if ((features & NETIF_F_HW_CSUM) &&
5297 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5298 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5299 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5302 /* Fix illegal SG+CSUM combinations. */
5303 if ((features & NETIF_F_SG) &&
5304 !(features & NETIF_F_ALL_CSUM)) {
5306 "Dropping NETIF_F_SG since no checksum feature.\n");
5307 features &= ~NETIF_F_SG;
5310 /* TSO requires that SG is present as well. */
5311 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5312 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5313 features &= ~NETIF_F_ALL_TSO;
5316 /* TSO ECN requires that TSO is present as well. */
5317 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5318 features &= ~NETIF_F_TSO_ECN;
5320 /* Software GSO depends on SG. */
5321 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5322 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5323 features &= ~NETIF_F_GSO;
5326 /* UFO needs SG and checksumming */
5327 if (features & NETIF_F_UFO) {
5328 /* maybe split UFO into V4 and V6? */
5329 if (!((features & NETIF_F_GEN_CSUM) ||
5330 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5331 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5333 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5334 features &= ~NETIF_F_UFO;
5337 if (!(features & NETIF_F_SG)) {
5339 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5340 features &= ~NETIF_F_UFO;
5347 int __netdev_update_features(struct net_device *dev)
5349 netdev_features_t features;
5354 features = netdev_get_wanted_features(dev);
5356 if (dev->netdev_ops->ndo_fix_features)
5357 features = dev->netdev_ops->ndo_fix_features(dev, features);
5359 /* driver might be less strict about feature dependencies */
5360 features = netdev_fix_features(dev, features);
5362 if (dev->features == features)
5365 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5366 &dev->features, &features);
5368 if (dev->netdev_ops->ndo_set_features)
5369 err = dev->netdev_ops->ndo_set_features(dev, features);
5371 if (unlikely(err < 0)) {
5373 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5374 err, &features, &dev->features);
5379 dev->features = features;
5385 * netdev_update_features - recalculate device features
5386 * @dev: the device to check
5388 * Recalculate dev->features set and send notifications if it
5389  *	has changed. Should be called after driver- or hardware-dependent
5390  *	conditions that influence the feature set might have changed.
5392 void netdev_update_features(struct net_device *dev)
5394 if (__netdev_update_features(dev))
5395 netdev_features_change(dev);
5397 EXPORT_SYMBOL(netdev_update_features);
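/*
 * Example usage (illustrative sketch only): if a driver learns that an
 * offload has become unusable (a hypothetical firmware event here), it
 * masks the capability out of hw_features and re-runs the negotiation.
 * The caller is assumed to hold RTNL.
 */
static void example_disable_tso(struct net_device *dev)
{
	ASSERT_RTNL();
	dev->hw_features &= ~NETIF_F_ALL_TSO;
	netdev_update_features(dev);	/* recompute and notify if changed */
}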
5400 * netdev_change_features - recalculate device features
5401 * @dev: the device to check
5403 * Recalculate dev->features set and send notifications even
5404 * if they have not changed. Should be called instead of
5405 * netdev_update_features() if also dev->vlan_features might
5406  *	have changed, to allow the changes to be propagated to stacked devices.
5409 void netdev_change_features(struct net_device *dev)
5411 __netdev_update_features(dev);
5412 netdev_features_change(dev);
5414 EXPORT_SYMBOL(netdev_change_features);
5417 * netif_stacked_transfer_operstate - transfer operstate
5418 * @rootdev: the root or lower level device to transfer state from
5419 * @dev: the device to transfer operstate to
5421 * Transfer operational state from root to device. This is normally
5422 * called when a stacking relationship exists between the root
5423  *	device and the device (a leaf device).
5425 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5426 struct net_device *dev)
5428 if (rootdev->operstate == IF_OPER_DORMANT)
5429 netif_dormant_on(dev);
5431 netif_dormant_off(dev);
5433 if (netif_carrier_ok(rootdev)) {
5434 if (!netif_carrier_ok(dev))
5435 netif_carrier_on(dev);
5437 if (netif_carrier_ok(dev))
5438 netif_carrier_off(dev);
5441 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5444 static int netif_alloc_rx_queues(struct net_device *dev)
5446 unsigned int i, count = dev->num_rx_queues;
5447 struct netdev_rx_queue *rx;
5451 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5457 for (i = 0; i < count; i++)
5463 static void netdev_init_one_queue(struct net_device *dev,
5464 struct netdev_queue *queue, void *_unused)
5466 /* Initialize queue lock */
5467 spin_lock_init(&queue->_xmit_lock);
5468 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5469 queue->xmit_lock_owner = -1;
5470 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5473 dql_init(&queue->dql, HZ);
5477 static int netif_alloc_netdev_queues(struct net_device *dev)
5479 unsigned int count = dev->num_tx_queues;
5480 struct netdev_queue *tx;
5484 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5490 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5491 spin_lock_init(&dev->tx_global_lock);
5497 * register_netdevice - register a network device
5498 * @dev: device to register
5500 * Take a completed network device structure and add it to the kernel
5501 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5502 * chain. 0 is returned on success. A negative errno code is returned
5503 * on a failure to set up the device, or if the name is a duplicate.
5505 * Callers must hold the rtnl semaphore. You may want
5506 * register_netdev() instead of this.
5509 * The locking appears insufficient to guarantee two parallel registers
5510 * will not get the same name.
5513 int register_netdevice(struct net_device *dev)
5516 struct net *net = dev_net(dev);
5518 BUG_ON(dev_boot_phase);
5523 /* When net_device's are persistent, this will be fatal. */
5524 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5527 spin_lock_init(&dev->addr_list_lock);
5528 netdev_set_addr_lockdep_class(dev);
5532 ret = dev_get_valid_name(net, dev, dev->name);
5536 /* Init, if this function is available */
5537 if (dev->netdev_ops->ndo_init) {
5538 ret = dev->netdev_ops->ndo_init(dev);
5546 if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_FILTER) &&
5547 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5548 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5549 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5556 dev->ifindex = dev_new_index(net);
5557 else if (__dev_get_by_index(net, dev->ifindex))
5560 if (dev->iflink == -1)
5561 dev->iflink = dev->ifindex;
5563 /* Transfer changeable features to wanted_features and enable
5564 * software offloads (GSO and GRO).
5566 dev->hw_features |= NETIF_F_SOFT_FEATURES;
5567 dev->features |= NETIF_F_SOFT_FEATURES;
5568 dev->wanted_features = dev->features & dev->hw_features;
5570 /* Turn on no cache copy if HW is doing checksum */
5571 if (!(dev->flags & IFF_LOOPBACK)) {
5572 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5573 if (dev->features & NETIF_F_ALL_CSUM) {
5574 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5575 dev->features |= NETIF_F_NOCACHE_COPY;
5579 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5581 dev->vlan_features |= NETIF_F_HIGHDMA;
5583 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5584 ret = notifier_to_errno(ret);
5588 ret = netdev_register_kobject(dev);
5591 dev->reg_state = NETREG_REGISTERED;
5593 __netdev_update_features(dev);
5596 * Default initial state at registry is that the
5597 * device is present.
5600 set_bit(__LINK_STATE_PRESENT, &dev->state);
5602 linkwatch_init_dev(dev);
5604 dev_init_scheduler(dev);
5606 list_netdevice(dev);
5607 add_device_randomness(dev->dev_addr, dev->addr_len);
5609 	/* If the device has a permanent device address, the driver should
5610 * set dev_addr and also addr_assign_type should be set to
5611 * NET_ADDR_PERM (default value).
5613 if (dev->addr_assign_type == NET_ADDR_PERM)
5614 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5616 	/* Notify protocols that a new device appeared. */
5617 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5618 ret = notifier_to_errno(ret);
5620 rollback_registered(dev);
5621 dev->reg_state = NETREG_UNREGISTERED;
5624 * Prevent userspace races by waiting until the network
5625 * device is fully setup before sending notifications.
5627 if (!dev->rtnl_link_ops ||
5628 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5629 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5635 if (dev->netdev_ops->ndo_uninit)
5636 dev->netdev_ops->ndo_uninit(dev);
5639 EXPORT_SYMBOL(register_netdevice);
5642 * init_dummy_netdev - init a dummy network device for NAPI
5643 * @dev: device to init
5645  *	This takes a network device structure and initializes the minimum
5646 * amount of fields so it can be used to schedule NAPI polls without
5647 * registering a full blown interface. This is to be used by drivers
5648 * that need to tie several hardware interfaces to a single NAPI
5649 * poll scheduler due to HW limitations.
5651 int init_dummy_netdev(struct net_device *dev)
5653 /* Clear everything. Note we don't initialize spinlocks
5654 	 * as they aren't supposed to be taken by any of the
5655 * NAPI code and this dummy netdev is supposed to be
5656 * only ever used for NAPI polls
5658 memset(dev, 0, sizeof(struct net_device));
5660 /* make sure we BUG if trying to hit standard
5661 * register/unregister code path
5663 dev->reg_state = NETREG_DUMMY;
5665 /* NAPI wants this */
5666 INIT_LIST_HEAD(&dev->napi_list);
5668 /* a dummy interface is started by default */
5669 set_bit(__LINK_STATE_PRESENT, &dev->state);
5670 set_bit(__LINK_STATE_START, &dev->state);
5672 	/* Note: We don't allocate pcpu_refcnt for dummy devices,
5673 	 * because users of this 'device' don't need to change its refcount.
5679 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5683 * register_netdev - register a network device
5684 * @dev: device to register
5686 * Take a completed network device structure and add it to the kernel
5687 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5688 * chain. 0 is returned on success. A negative errno code is returned
5689 * on a failure to set up the device, or if the name is a duplicate.
5691 * This is a wrapper around register_netdevice that takes the rtnl semaphore
5692  *	and expands the device name if you passed a format string to alloc_netdev().
5695 int register_netdev(struct net_device *dev)
5700 err = register_netdevice(dev);
5704 EXPORT_SYMBOL(register_netdev);
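/*
 * Example usage (illustrative sketch only): the common driver pattern is
 * allocate, fill in ops, register, and free_netdev() on failure.
 * register_netdev() takes RTNL internally. The example_* names are made
 * up, and the minimal ops table stands in for a driver's real callbacks.
 */
static const struct net_device_ops example_netdev_ops = {
	.ndo_validate_addr	= eth_validate_addr,	/* real drivers add xmit etc. */
};

static struct net_device *example_probe_one(void)
{
	struct net_device *dev;

	dev = alloc_etherdev(0);	/* no private area in this sketch */
	if (!dev)
		return NULL;

	dev->netdev_ops = &example_netdev_ops;
	eth_hw_addr_random(dev);

	if (register_netdev(dev)) {
		free_netdev(dev);
		return NULL;
	}
	return dev;
}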
5706 int netdev_refcnt_read(const struct net_device *dev)
5710 for_each_possible_cpu(i)
5711 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5714 EXPORT_SYMBOL(netdev_refcnt_read);
5717 * netdev_wait_allrefs - wait until all references are gone.
5718 * @dev: target net_device
5720 * This is called when unregistering network devices.
5722 * Any protocol or device that holds a reference should register
5723 * for netdevice notification, and cleanup and put back the
5724 * reference if they receive an UNREGISTER event.
5725  * We can get stuck here if buggy protocols don't correctly call dev_put().
5728 static void netdev_wait_allrefs(struct net_device *dev)
5730 unsigned long rebroadcast_time, warning_time;
5733 linkwatch_forget_dev(dev);
5735 rebroadcast_time = warning_time = jiffies;
5736 refcnt = netdev_refcnt_read(dev);
5738 while (refcnt != 0) {
5739 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5742 /* Rebroadcast unregister notification */
5743 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5749 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5750 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5752 /* We must not have linkwatch events
5753 * pending on unregister. If this
5754 * happens, we simply run the queue
5755 * unscheduled, resulting in a noop
5758 linkwatch_run_queue();
5763 rebroadcast_time = jiffies;
5768 refcnt = netdev_refcnt_read(dev);
5770 if (time_after(jiffies, warning_time + 10 * HZ)) {
5771 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5773 warning_time = jiffies;
5782 * register_netdevice(x1);
5783 * register_netdevice(x2);
5785 * unregister_netdevice(y1);
5786 * unregister_netdevice(y2);
5792 * We are invoked by rtnl_unlock().
5793 * This allows us to deal with problems:
5794 * 1) We can delete sysfs objects which invoke hotplug
5795 * without deadlocking with linkwatch via keventd.
5796 * 2) Since we run with the RTNL semaphore not held, we can sleep
5797 * safely in order to wait for the netdev refcnt to drop to zero.
5799 * We must not return until all unregister events added during
5800 * the interval the lock was held have been completed.
5802 void netdev_run_todo(void)
5804 struct list_head list;
5806 /* Snapshot list, allow later requests */
5807 list_replace_init(&net_todo_list, &list);
5812 /* Wait for rcu callbacks to finish before next phase */
5813 if (!list_empty(&list))
5816 while (!list_empty(&list)) {
5817 struct net_device *dev
5818 = list_first_entry(&list, struct net_device, todo_list);
5819 list_del(&dev->todo_list);
5822 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5825 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5826 pr_err("network todo '%s' but state %d\n",
5827 dev->name, dev->reg_state);
5832 dev->reg_state = NETREG_UNREGISTERED;
5834 on_each_cpu(flush_backlog, dev, 1);
5836 netdev_wait_allrefs(dev);
5839 BUG_ON(netdev_refcnt_read(dev));
5840 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5841 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5842 WARN_ON(dev->dn_ptr);
5844 if (dev->destructor)
5845 dev->destructor(dev);
5847 /* Free network device */
5848 kobject_put(&dev->dev.kobj);
5852 /* Convert net_device_stats to rtnl_link_stats64. They have the same
5853 * fields in the same order, with only the type differing.
5855 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5856 const struct net_device_stats *netdev_stats)
5858 #if BITS_PER_LONG == 64
5859 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5860 memcpy(stats64, netdev_stats, sizeof(*stats64));
5862 size_t i, n = sizeof(*stats64) / sizeof(u64);
5863 const unsigned long *src = (const unsigned long *)netdev_stats;
5864 u64 *dst = (u64 *)stats64;
5866 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5867 sizeof(*stats64) / sizeof(u64));
5868 for (i = 0; i < n; i++)
5872 EXPORT_SYMBOL(netdev_stats_to_stats64);
5875 * dev_get_stats - get network device statistics
5876 * @dev: device to get statistics from
5877 * @storage: place to store stats
5879 * Get network statistics from device. Return @storage.
5880 * The device driver may provide its own method by setting
5881 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5882 * otherwise the internal statistics structure is used.
5884 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5885 struct rtnl_link_stats64 *storage)
5887 const struct net_device_ops *ops = dev->netdev_ops;
5889 if (ops->ndo_get_stats64) {
5890 memset(storage, 0, sizeof(*storage));
5891 ops->ndo_get_stats64(dev, storage);
5892 } else if (ops->ndo_get_stats) {
5893 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5895 netdev_stats_to_stats64(storage, &dev->stats);
5897 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5900 EXPORT_SYMBOL(dev_get_stats);
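/*
 * Example usage (illustrative sketch only): dev_get_stats() fills the
 * caller-supplied rtnl_link_stats64, so a stack variable is enough.
 * example_log_rx_drops() is a made-up helper.
 */
static void example_log_rx_drops(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);
	netdev_info(dev, "rx_packets=%llu rx_dropped=%llu\n",
		    (unsigned long long)stats.rx_packets,
		    (unsigned long long)stats.rx_dropped);
}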
5902 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5904 struct netdev_queue *queue = dev_ingress_queue(dev);
5906 #ifdef CONFIG_NET_CLS_ACT
5909 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5912 netdev_init_one_queue(dev, queue, NULL);
5913 queue->qdisc = &noop_qdisc;
5914 queue->qdisc_sleeping = &noop_qdisc;
5915 rcu_assign_pointer(dev->ingress_queue, queue);
5920 static const struct ethtool_ops default_ethtool_ops;
5922 void netdev_set_default_ethtool_ops(struct net_device *dev,
5923 const struct ethtool_ops *ops)
5925 if (dev->ethtool_ops == &default_ethtool_ops)
5926 dev->ethtool_ops = ops;
5928 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5931 * alloc_netdev_mqs - allocate network device
5932 * @sizeof_priv: size of private data to allocate space for
5933 * @name: device name format string
5934 * @setup: callback to initialize device
5935 * @txqs: the number of TX subqueues to allocate
5936 * @rxqs: the number of RX subqueues to allocate
5938 * Allocates a struct net_device with private data area for driver use
5939  *	and performs basic initialization. Also allocates subqueue structs
5940 * for each queue on the device.
5942 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5943 void (*setup)(struct net_device *),
5944 unsigned int txqs, unsigned int rxqs)
5946 struct net_device *dev;
5948 struct net_device *p;
5950 BUG_ON(strlen(name) >= sizeof(dev->name));
5953 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5959 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5964 alloc_size = sizeof(struct net_device);
5966 /* ensure 32-byte alignment of private area */
5967 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5968 alloc_size += sizeof_priv;
5970 /* ensure 32-byte alignment of whole construct */
5971 alloc_size += NETDEV_ALIGN - 1;
5973 p = kzalloc(alloc_size, GFP_KERNEL);
5977 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5978 dev->padded = (char *)dev - (char *)p;
5980 dev->pcpu_refcnt = alloc_percpu(int);
5981 if (!dev->pcpu_refcnt)
5984 if (dev_addr_init(dev))
5990 dev_net_set(dev, &init_net);
5992 dev->gso_max_size = GSO_MAX_SIZE;
5993 dev->gso_max_segs = GSO_MAX_SEGS;
5995 INIT_LIST_HEAD(&dev->napi_list);
5996 INIT_LIST_HEAD(&dev->unreg_list);
5997 INIT_LIST_HEAD(&dev->link_watch_list);
5998 INIT_LIST_HEAD(&dev->upper_dev_list);
5999 dev->priv_flags = IFF_XMIT_DST_RELEASE;
6002 dev->num_tx_queues = txqs;
6003 dev->real_num_tx_queues = txqs;
6004 if (netif_alloc_netdev_queues(dev))
6008 dev->num_rx_queues = rxqs;
6009 dev->real_num_rx_queues = rxqs;
6010 if (netif_alloc_rx_queues(dev))
6014 strcpy(dev->name, name);
6015 dev->group = INIT_NETDEV_GROUP;
6016 if (!dev->ethtool_ops)
6017 dev->ethtool_ops = &default_ethtool_ops;
6025 free_percpu(dev->pcpu_refcnt);
6035 EXPORT_SYMBOL(alloc_netdev_mqs);
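/*
 * Example usage (illustrative sketch only): a driver wanting four TX and
 * four RX queues plus a private area could allocate its device as below,
 * with ether_setup() as the setup callback. struct example_priv and the
 * name template are assumptions of this sketch.
 */
struct example_priv {
	int placeholder;
};

static struct net_device *example_alloc_mq_netdev(void)
{
	return alloc_netdev_mqs(sizeof(struct example_priv), "example%d",
				ether_setup, 4, 4);
}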
6038 * free_netdev - free network device
6041 * This function does the last stage of destroying an allocated device
6042 * interface. The reference to the device object is released.
6043 * If this is the last reference then it will be freed.
6045 void free_netdev(struct net_device *dev)
6047 struct napi_struct *p, *n;
6049 release_net(dev_net(dev));
6056 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6058 /* Flush device addresses */
6059 dev_addr_flush(dev);
6061 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6064 free_percpu(dev->pcpu_refcnt);
6065 dev->pcpu_refcnt = NULL;
6067 /* Compatibility with error handling in drivers */
6068 if (dev->reg_state == NETREG_UNINITIALIZED) {
6069 kfree((char *)dev - dev->padded);
6073 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6074 dev->reg_state = NETREG_RELEASED;
6076 /* will free via device release */
6077 put_device(&dev->dev);
6079 EXPORT_SYMBOL(free_netdev);
6082 * synchronize_net - Synchronize with packet receive processing
6084 * Wait for packets currently being received to be done.
6085 * Does not block later packets from starting.
6087 void synchronize_net(void)
6090 if (rtnl_is_locked())
6091 synchronize_rcu_expedited();
6095 EXPORT_SYMBOL(synchronize_net);
6098 * unregister_netdevice_queue - remove device from the kernel
6102 * This function shuts down a device interface and removes it
6103 * from the kernel tables.
6104  *	If @head is not NULL, the device is queued to be unregistered later.
6106 * Callers must hold the rtnl semaphore. You may want
6107 * unregister_netdev() instead of this.
6110 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6115 list_move_tail(&dev->unreg_list, head);
6117 rollback_registered(dev);
6118 /* Finish processing unregister after unlock */
6122 EXPORT_SYMBOL(unregister_netdevice_queue);
6125 * unregister_netdevice_many - unregister many devices
6126 * @head: list of devices
6128 void unregister_netdevice_many(struct list_head *head)
6130 struct net_device *dev;
6132 if (!list_empty(head)) {
6133 rollback_registered_many(head);
6134 list_for_each_entry(dev, head, unreg_list)
6138 EXPORT_SYMBOL(unregister_netdevice_many);
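/*
 * Example usage (illustrative sketch only): queueing several devices on
 * one list and unregistering them together amortises the notifier and
 * RCU grace-period costs. The caller must hold RTNL; the helper name is
 * made up.
 */
static void example_destroy_pair(struct net_device *a, struct net_device *b)
{
	LIST_HEAD(kill_list);

	ASSERT_RTNL();
	unregister_netdevice_queue(a, &kill_list);
	unregister_netdevice_queue(b, &kill_list);
	unregister_netdevice_many(&kill_list);
}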
6141 * unregister_netdev - remove device from the kernel
6144 * This function shuts down a device interface and removes it
6145 * from the kernel tables.
6147 * This is just a wrapper for unregister_netdevice that takes
6148 * the rtnl semaphore. In general you want to use this and not
6149 * unregister_netdevice.
6151 void unregister_netdev(struct net_device *dev)
6154 unregister_netdevice(dev);
6157 EXPORT_SYMBOL(unregister_netdev);
6160  *	dev_change_net_namespace - move device to a different network namespace
6162 * @net: network namespace
6163 * @pat: If not NULL name pattern to try if the current device name
6164 * is already taken in the destination network namespace.
6166 * This function shuts down a device interface and moves it
6167 * to a new network namespace. On success 0 is returned, on
6168  *	a failure a negative errno code is returned.
6170 * Callers must hold the rtnl semaphore.
6173 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6179 /* Don't allow namespace local devices to be moved. */
6181 if (dev->features & NETIF_F_NETNS_LOCAL)
6184 	/* Ensure the device has been registered */
6185 if (dev->reg_state != NETREG_REGISTERED)
6188 	/* Get out if there is nothing to do */
6190 if (net_eq(dev_net(dev), net))
6193 /* Pick the destination device name, and ensure
6194 * we can use it in the destination network namespace.
6197 if (__dev_get_by_name(net, dev->name)) {
6198 /* We get here if we can't use the current device name */
6201 if (dev_get_valid_name(net, dev, pat) < 0)
6206 	 * And now a mini version of register_netdevice and unregister_netdevice.
6209 	/* If device is running, close it first. */
6212 /* And unlink it from device chain */
6214 unlist_netdevice(dev);
6218 /* Shutdown queueing discipline. */
6221 	/* Notify protocols that we are about to destroy
6222 this device. They should clean all the things.
6224 Note that dev->reg_state stays at NETREG_REGISTERED.
6225 This is wanted because this way 8021q and macvlan know
6226 the device is just moving and can keep their slaves up.
6228 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6230 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6231 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6234 * Flush the unicast and multicast chains
6239 /* Send a netdev-removed uevent to the old namespace */
6240 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6242 /* Actually switch the network namespace */
6243 dev_net_set(dev, net);
6245 /* If there is an ifindex conflict assign a new one */
6246 if (__dev_get_by_index(net, dev->ifindex)) {
6247 int iflink = (dev->iflink == dev->ifindex);
6248 dev->ifindex = dev_new_index(net);
6250 dev->iflink = dev->ifindex;
6253 /* Send a netdev-add uevent to the new namespace */
6254 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6256 /* Fixup kobjects */
6257 err = device_rename(&dev->dev, dev->name);
6260 /* Add the device back in the hashes */
6261 list_netdevice(dev);
6263 	/* Notify protocols that a new device appeared. */
6264 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6267 * Prevent userspace races by waiting until the network
6268 * device is fully setup before sending notifications.
6270 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6277 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
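/*
 * Example usage (illustrative sketch only): moving a device into the
 * namespace of a given process, roughly in the spirit of the rtnetlink
 * IFLA_NET_NS_PID handling. get_net_ns_by_pid() and put_net() are
 * existing helpers; the error handling is simplified and the function
 * name is invented.
 */
static int example_move_to_pid_netns(struct net_device *dev, pid_t pid)
{
	struct net *net;
	int err;

	net = get_net_ns_by_pid(pid);
	if (IS_ERR(net))
		return PTR_ERR(net);

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, NULL);
	rtnl_unlock();

	put_net(net);
	return err;
}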
6279 static int dev_cpu_callback(struct notifier_block *nfb,
6280 unsigned long action,
6283 struct sk_buff **list_skb;
6284 struct sk_buff *skb;
6285 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6286 struct softnet_data *sd, *oldsd;
6288 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6291 local_irq_disable();
6292 cpu = smp_processor_id();
6293 sd = &per_cpu(softnet_data, cpu);
6294 oldsd = &per_cpu(softnet_data, oldcpu);
6296 /* Find end of our completion_queue. */
6297 list_skb = &sd->completion_queue;
6299 list_skb = &(*list_skb)->next;
6300 /* Append completion queue from offline CPU. */
6301 *list_skb = oldsd->completion_queue;
6302 oldsd->completion_queue = NULL;
6304 /* Append output queue from offline CPU. */
6305 if (oldsd->output_queue) {
6306 *sd->output_queue_tailp = oldsd->output_queue;
6307 sd->output_queue_tailp = oldsd->output_queue_tailp;
6308 oldsd->output_queue = NULL;
6309 oldsd->output_queue_tailp = &oldsd->output_queue;
6311 /* Append NAPI poll list from offline CPU. */
6312 if (!list_empty(&oldsd->poll_list)) {
6313 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6314 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6317 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6320 /* Process offline CPU's input_pkt_queue */
6321 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6323 input_queue_head_incr(oldsd);
6325 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6327 input_queue_head_incr(oldsd);
6335 * netdev_increment_features - increment feature set by one
6336 * @all: current feature set
6337 * @one: new feature set
6338 * @mask: mask feature set
6340 * Computes a new feature set after adding a device with feature set
6341 * @one to the master device with current feature set @all. Will not
6342 * enable anything that is off in @mask. Returns the new feature set.
6344 netdev_features_t netdev_increment_features(netdev_features_t all,
6345 netdev_features_t one, netdev_features_t mask)
6347 if (mask & NETIF_F_GEN_CSUM)
6348 mask |= NETIF_F_ALL_CSUM;
6349 mask |= NETIF_F_VLAN_CHALLENGED;
6351 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6352 all &= one | ~NETIF_F_ALL_FOR_ALL;
6354 /* If one device supports hw checksumming, set for all. */
6355 if (all & NETIF_F_GEN_CSUM)
6356 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6360 EXPORT_SYMBOL(netdev_increment_features);
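/*
 * Example usage (illustrative sketch only): an aggregating driver (in the
 * spirit of bonding/team, but not their actual code) could fold the
 * features of two lower devices into one set like this.
 */
static netdev_features_t example_combine_features(const struct net_device *a,
						  const struct net_device *b)
{
	netdev_features_t mask = NETIF_F_ALL_CSUM | NETIF_F_SG | NETIF_F_ALL_TSO;
	netdev_features_t all = mask;

	all = netdev_increment_features(all, a->features, mask);
	all = netdev_increment_features(all, b->features, mask);
	return all;
}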
6362 static struct hlist_head *netdev_create_hash(void)
6365 struct hlist_head *hash;
6367 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6369 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6370 INIT_HLIST_HEAD(&hash[i]);
6375 /* Initialize per network namespace state */
6376 static int __net_init netdev_init(struct net *net)
6378 if (net != &init_net)
6379 INIT_LIST_HEAD(&net->dev_base_head);
6381 net->dev_name_head = netdev_create_hash();
6382 if (net->dev_name_head == NULL)
6385 net->dev_index_head = netdev_create_hash();
6386 if (net->dev_index_head == NULL)
6392 kfree(net->dev_name_head);
6398 * netdev_drivername - network driver for the device
6399 * @dev: network device
6401 * Determine network driver for device.
6403 const char *netdev_drivername(const struct net_device *dev)
6405 const struct device_driver *driver;
6406 const struct device *parent;
6407 const char *empty = "";
6409 parent = dev->dev.parent;
6413 driver = parent->driver;
6414 if (driver && driver->name)
6415 return driver->name;
6419 static int __netdev_printk(const char *level, const struct net_device *dev,
6420 struct va_format *vaf)
6424 if (dev && dev->dev.parent) {
6425 r = dev_printk_emit(level[1] - '0',
6428 dev_driver_string(dev->dev.parent),
6429 dev_name(dev->dev.parent),
6430 netdev_name(dev), vaf);
6432 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6434 r = printk("%s(NULL net_device): %pV", level, vaf);
6440 int netdev_printk(const char *level, const struct net_device *dev,
6441 const char *format, ...)
6443 struct va_format vaf;
6447 va_start(args, format);
6452 r = __netdev_printk(level, dev, &vaf);
6458 EXPORT_SYMBOL(netdev_printk);
6460 #define define_netdev_printk_level(func, level) \
6461 int func(const struct net_device *dev, const char *fmt, ...) \
6464 struct va_format vaf; \
6467 va_start(args, fmt); \
6472 r = __netdev_printk(level, dev, &vaf); \
6478 EXPORT_SYMBOL(func);
6480 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6481 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6482 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6483 define_netdev_printk_level(netdev_err, KERN_ERR);
6484 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6485 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6486 define_netdev_printk_level(netdev_info, KERN_INFO);
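/*
 * Example usage (illustrative sketch only): drivers normally call the
 * level-specific wrappers generated above rather than netdev_printk()
 * itself; the messages here are made up.
 */
static void example_report_link(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}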
6488 static void __net_exit netdev_exit(struct net *net)
6490 kfree(net->dev_name_head);
6491 kfree(net->dev_index_head);
6494 static struct pernet_operations __net_initdata netdev_net_ops = {
6495 .init = netdev_init,
6496 .exit = netdev_exit,
6499 static void __net_exit default_device_exit(struct net *net)
6501 struct net_device *dev, *aux;
6503 * Push all migratable network devices back to the
6504 * initial network namespace
6507 for_each_netdev_safe(net, dev, aux) {
6509 char fb_name[IFNAMSIZ];
6511 /* Ignore unmoveable devices (i.e. loopback) */
6512 if (dev->features & NETIF_F_NETNS_LOCAL)
6515 /* Leave virtual devices for the generic cleanup */
6516 if (dev->rtnl_link_ops)
6519 /* Push remaining network devices to init_net */
6520 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6521 err = dev_change_net_namespace(dev, &init_net, fb_name);
6523 pr_emerg("%s: failed to move %s to init_net: %d\n",
6524 __func__, dev->name, err);
6531 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6533 	/* At exit all network devices must be removed from a network
6534 * namespace. Do this in the reverse order of registration.
6535 * Do this across as many network namespaces as possible to
6536 * improve batching efficiency.
6538 struct net_device *dev;
6540 LIST_HEAD(dev_kill_list);
6543 list_for_each_entry(net, net_list, exit_list) {
6544 for_each_netdev_reverse(net, dev) {
6545 if (dev->rtnl_link_ops)
6546 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6548 unregister_netdevice_queue(dev, &dev_kill_list);
6551 unregister_netdevice_many(&dev_kill_list);
6552 list_del(&dev_kill_list);
6556 static struct pernet_operations __net_initdata default_device_ops = {
6557 .exit = default_device_exit,
6558 .exit_batch = default_device_exit_batch,
6562 * Initialize the DEV module. At boot time this walks the device list and
6563 * unhooks any devices that fail to initialise (normally hardware not
6564 * present) and leaves us with a valid list of present and active devices.
6569 * This is called single threaded during boot, so no need
6570 * to take the rtnl semaphore.
6572 static int __init net_dev_init(void)
6574 int i, rc = -ENOMEM;
6576 BUG_ON(!dev_boot_phase);
6578 if (dev_proc_init())
6581 if (netdev_kobject_init())
6584 INIT_LIST_HEAD(&ptype_all);
6585 for (i = 0; i < PTYPE_HASH_SIZE; i++)
6586 INIT_LIST_HEAD(&ptype_base[i]);
6588 INIT_LIST_HEAD(&offload_base);
6590 if (register_pernet_subsys(&netdev_net_ops))
6594 * Initialise the packet receive queues.
6597 for_each_possible_cpu(i) {
6598 struct softnet_data *sd = &per_cpu(softnet_data, i);
6600 memset(sd, 0, sizeof(*sd));
6601 skb_queue_head_init(&sd->input_pkt_queue);
6602 skb_queue_head_init(&sd->process_queue);
6603 sd->completion_queue = NULL;
6604 INIT_LIST_HEAD(&sd->poll_list);
6605 sd->output_queue = NULL;
6606 sd->output_queue_tailp = &sd->output_queue;
6608 sd->csd.func = rps_trigger_softirq;
6614 sd->backlog.poll = process_backlog;
6615 sd->backlog.weight = weight_p;
6616 sd->backlog.gro_list = NULL;
6617 sd->backlog.gro_count = 0;
6622 	/* The loopback device is special: if any other network device
6623 	 * is present in a network namespace, the loopback device must
6624 	 * be present too. Since we now dynamically allocate and free the
6625 	 * loopback device, ensure this invariant is maintained by
6626 	 * keeping the loopback device as the first device on the
6627 	 * list of network devices. It is the first device that appears
6628 	 * and the last network device to disappear.
6631 if (register_pernet_device(&loopback_net_ops))
6634 if (register_pernet_device(&default_device_ops))
6637 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6638 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6640 hotcpu_notifier(dev_cpu_callback, 0);
6648 subsys_initcall(net_dev_init);