net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <linux/ipv6.h>
 122 #include <linux/in.h>
 123 #include <linux/jhash.h>
 124 #include <linux/random.h>
 125 #include <trace/events/napi.h>
 126 #include <trace/events/net.h>
 127 #include <trace/events/skb.h>
 128 #include <linux/pci.h>
 129 #include <linux/inetdevice.h>
 130 #include <linux/cpu_rmap.h>
 131 #include <linux/static_key.h>
 132
 133 #include "net-sysfs.h"
 134
 135 /* Instead of increasing this, you should create a hash table. */
 136 #define MAX_GRO_SKBS 8
 137
 138 /* This should be increased if a protocol with a bigger head is added. */
 139 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 140
 141 static DEFINE_SPINLOCK(ptype_lock);
 142 static DEFINE_SPINLOCK(offload_lock);
 143 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 144 struct list_head ptype_all __read_mostly;       /* Taps */
 145 static struct list_head offload_base __read_mostly;
 146
 147 /*
 148  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 149  * semaphore.
 150  *
 151  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 152  *
 153  * Writers must hold the rtnl semaphore while they loop through the
 154  * dev_base_head list, and hold dev_base_lock for writing when they do the
 155  * actual updates.  This allows pure readers to access the list even
 156  * while a writer is preparing to update it.
 157  *
 158  * To put it another way, dev_base_lock is held for writing only to
 159  * protect against pure readers; the rtnl semaphore provides the
 160  * protection against other writers.
 161  *
 162  * See, for example usages, register_netdevice() and
 163  * unregister_netdevice(), which must be called with the rtnl
 164  * semaphore held.
 165  */
 166 DEFINE_RWLOCK(dev_base_lock);
 167 EXPORT_SYMBOL(dev_base_lock);
 168
 169 seqcount_t devnet_rename_seq;
 170
 171 static inline void dev_base_seq_inc(struct net *net)
 172 {
 173         while (++net->dev_base_seq == 0);
 174 }
 175
 176 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 177 {
 178         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 179
 180         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 181 }
 182
 183 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 184 {
 185         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 186 }
 187
 188 static inline void rps_lock(struct softnet_data *sd)
 189 {
 190 #ifdef CONFIG_RPS
 191         spin_lock(&sd->input_pkt_queue.lock);
 192 #endif
 193 }
 194
 195 static inline void rps_unlock(struct softnet_data *sd)
 196 {
 197 #ifdef CONFIG_RPS
 198         spin_unlock(&sd->input_pkt_queue.lock);
 199 #endif
 200 }
 201
 202 /* Device list insertion */
 203 static void list_netdevice(struct net_device *dev)
 204 {
 205         struct net *net = dev_net(dev);
 206
 207         ASSERT_RTNL();
 208
 209         write_lock_bh(&dev_base_lock);
 210         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 211         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 212         hlist_add_head_rcu(&dev->index_hlist,
 213                            dev_index_hash(net, dev->ifindex));
 214         write_unlock_bh(&dev_base_lock);
 215
 216         dev_base_seq_inc(net);
 217 }
 218
 219 /* Device list removal
 220  * caller must respect a RCU grace period before freeing/reusing dev
 221  */
 222 static void unlist_netdevice(struct net_device *dev)
 223 {
 224         ASSERT_RTNL();
 225
 226         /* Unlink dev from the device chain */
 227         write_lock_bh(&dev_base_lock);
 228         list_del_rcu(&dev->dev_list);
 229         hlist_del_rcu(&dev->name_hlist);
 230         hlist_del_rcu(&dev->index_hlist);
 231         write_unlock_bh(&dev_base_lock);
 232
 233         dev_base_seq_inc(dev_net(dev));
 234 }
 235
 236 /*
 237  *      Our notifier list
 238  */
 239
 240 static RAW_NOTIFIER_HEAD(netdev_chain);
 241
 242 /*
 243  *      Device drivers call our routines to queue packets here. We empty the
 244  *      queue in the local softnet handler.
 245  */
 246
 247 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 248 EXPORT_PER_CPU_SYMBOL(softnet_data);
 249
 250 #ifdef CONFIG_LOCKDEP
 251 /*
 252  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 253  * according to dev->type
 254  */
 255 static const unsigned short netdev_lock_type[] =
 256         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 257          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 258          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 259          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 260          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 261          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 262          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 263          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 264          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 265          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 266          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 267          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 268          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 269          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 270          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 271
 272 static const char *const netdev_lock_name[] =
 273         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 274          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 275          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 276          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 277          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 278          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 279          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 280          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 281          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 282          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 283          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 284          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 285          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 286          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 287          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 288
 289 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 290 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 291
 292 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 293 {
 294         int i;
 295
 296         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 297                 if (netdev_lock_type[i] == dev_type)
 298                         return i;
 299         /* the last key is used by default */
 300         return ARRAY_SIZE(netdev_lock_type) - 1;
 301 }
 302
 303 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 304                                                  unsigned short dev_type)
 305 {
 306         int i;
 307
 308         i = netdev_lock_pos(dev_type);
 309         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 310                                    netdev_lock_name[i]);
 311 }
 312
 313 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 314 {
 315         int i;
 316
 317         i = netdev_lock_pos(dev->type);
 318         lockdep_set_class_and_name(&dev->addr_list_lock,
 319                                    &netdev_addr_lock_key[i],
 320                                    netdev_lock_name[i]);
 321 }
 322 #else
 323 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 324                                                  unsigned short dev_type)
 325 {
 326 }
 327 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 328 {
 329 }
 330 #endif
 331
 332 /*******************************************************************************
 333
 334                 Protocol management and registration routines
 335
 336 *******************************************************************************/
 337
 338 /*
 339  *      Add a protocol ID to the list. Now that the input handler is
 340  *      smarter we can dispense with all the messy stuff that used to be
 341  *      here.
 342  *
 343  *      BEWARE!!! Protocol handlers, mangling input packets,
 344  *      MUST BE last in hash buckets and checking protocol handlers
 345  *      MUST start from promiscuous ptype_all chain in net_bh.
 346  *      It is true now, do not change it.
 347  *      Explanation follows: if protocol handler, mangling packet, will
 348  *      be the first on list, it is not able to sense, that packet
 349  *      is cloned and should be copied-on-write, so that it will
 350  *      change it and subsequent readers will get broken packet.
 351  *                                                      --ANK (980803)
 352  */
 353
 354 static inline struct list_head *ptype_head(const struct packet_type *pt)
 355 {
 356         if (pt->type == htons(ETH_P_ALL))
 357                 return &ptype_all;
 358         else
 359                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 360 }
 361
 362 /**
 363  *      dev_add_pack - add packet handler
 364  *      @pt: packet type declaration
 365  *
 366  *      Add a protocol handler to the networking stack. The passed &packet_type
 367  *      is linked into kernel lists and may not be freed until it has been
 368  *      removed from the kernel lists.
 369  *
 370  *      This call does not sleep therefore it can not
 371  *      guarantee all CPU's that are in middle of receiving packets
 372  *      will see the new packet type (until the next received packet).
 373  */
 374
 375 void dev_add_pack(struct packet_type *pt)
 376 {
 377         struct list_head *head = ptype_head(pt);
 378
 379         spin_lock(&ptype_lock);
 380         list_add_rcu(&pt->list, head);
 381         spin_unlock(&ptype_lock);
 382 }
 383 EXPORT_SYMBOL(dev_add_pack);
 384
 385 /**
 386  *      __dev_remove_pack        - remove packet handler
 387  *      @pt: packet type declaration
 388  *
 389  *      Remove a protocol handler that was previously added to the kernel
 390  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 391  *      from the kernel lists and can be freed or reused once this function
 392  *      returns.
 393  *
 394  *      The packet type might still be in use by receivers
 395  *      and must not be freed until after all the CPU's have gone
 396  *      through a quiescent state.
 397  */
 398 void __dev_remove_pack(struct packet_type *pt)
 399 {
 400         struct list_head *head = ptype_head(pt);
 401         struct packet_type *pt1;
 402
 403         spin_lock(&ptype_lock);
 404
 405         list_for_each_entry(pt1, head, list) {
 406                 if (pt == pt1) {
 407                         list_del_rcu(&pt->list);
 408                         goto out;
 409                 }
 410         }
 411
 412         pr_warn("dev_remove_pack: %p not found\n", pt);
 413 out:
 414         spin_unlock(&ptype_lock);
 415 }
 416 EXPORT_SYMBOL(__dev_remove_pack);
 417
 418 /**
 419  *      dev_remove_pack  - remove packet handler
 420  *      @pt: packet type declaration
 421  *
 422  *      Remove a protocol handler that was previously added to the kernel
 423  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 424  *      from the kernel lists and can be freed or reused once this function
 425  *      returns.
 426  *
 427  *      This call sleeps to guarantee that no CPU is looking at the packet
 428  *      type after return.
 429  */
 430 void dev_remove_pack(struct packet_type *pt)
 431 {
 432         __dev_remove_pack(pt);
 433
 434         synchronize_net();
 435 }
 436 EXPORT_SYMBOL(dev_remove_pack);
 437
 438
 439 /**
 440  *      dev_add_offload - register offload handlers
 441  *      @po: protocol offload declaration
 442  *
 443  *      Add protocol offload handlers to the networking stack. The passed
 444  *      &proto_offload is linked into kernel lists and may not be freed until
 445  *      it has been removed from the kernel lists.
 446  *
 447  *      This call does not sleep therefore it can not
 448  *      guarantee all CPU's that are in middle of receiving packets
 449  *      will see the new offload handlers (until the next received packet).
 450  */
 451 void dev_add_offload(struct packet_offload *po)
 452 {
 453         struct list_head *head = &offload_base;
 454
 455         spin_lock(&offload_lock);
 456         list_add_rcu(&po->list, head);
 457         spin_unlock(&offload_lock);
 458 }
 459 EXPORT_SYMBOL(dev_add_offload);
 460
 461 /**
 462  *      __dev_remove_offload     - remove offload handler
 463  *      @po: packet offload declaration
 464  *
 465  *      Remove a protocol offload handler that was previously added to the
 466  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 467  *      is removed from the kernel lists and can be freed or reused once this
 468  *      function returns.
 469  *
 470  *      The packet type might still be in use by receivers
 471  *      and must not be freed until after all the CPU's have gone
 472  *      through a quiescent state.
 473  */
 474 void __dev_remove_offload(struct packet_offload *po)
 475 {
 476         struct list_head *head = &offload_base;
 477         struct packet_offload *po1;
 478
 479         spin_lock(&offload_lock);
 480
 481         list_for_each_entry(po1, head, list) {
 482                 if (po == po1) {
 483                         list_del_rcu(&po->list);
 484                         goto out;
 485                 }
 486         }
 487
 488         pr_warn("dev_remove_offload: %p not found\n", po);
 489 out:
 490         spin_unlock(&offload_lock);
 491 }
 492 EXPORT_SYMBOL(__dev_remove_offload);
 493
 494 /**
 495  *      dev_remove_offload       - remove packet offload handler
 496  *      @po: packet offload declaration
 497  *
 498  *      Remove a packet offload handler that was previously added to the kernel
 499  *      offload handlers by dev_add_offload(). The passed &offload_type is
 500  *      removed from the kernel lists and can be freed or reused once this
 501  *      function returns.
 502  *
 503  *      This call sleeps to guarantee that no CPU is looking at the packet
 504  *      type after return.
 505  */
 506 void dev_remove_offload(struct packet_offload *po)
 507 {
 508         __dev_remove_offload(po);
 509
 510         synchronize_net();
 511 }
 512 EXPORT_SYMBOL(dev_remove_offload);
 513
 514 /******************************************************************************
 515
 516                       Device Boot-time Settings Routines
 517
 518 *******************************************************************************/
 519
 520 /* Boot time configuration table */
 521 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 522
 523 /**
 524  *      netdev_boot_setup_add   - add new setup entry
 525  *      @name: name of the device
 526  *      @map: configured settings for the device
 527  *
 528  *      Adds new setup entry to the dev_boot_setup list.  The function
 529  *      returns 0 on error and 1 on success.  This is a generic routine to
 530  *      all netdevices.
 531  */
 532 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 533 {
 534         struct netdev_boot_setup *s;
 535         int i;
 536
 537         s = dev_boot_setup;
 538         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 539                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 540                         memset(s[i].name, 0, sizeof(s[i].name));
 541                         strlcpy(s[i].name, name, IFNAMSIZ);
 542                         memcpy(&s[i].map, map, sizeof(s[i].map));
 543                         break;
 544                 }
 545         }
 546
 547         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 548 }
 549
 550 /**
 551  *      netdev_boot_setup_check - check boot time settings
 552  *      @dev: the netdevice
 553  *
 554  *      Check boot time settings for the device.
 555  *      The found settings are set for the device to be used
 556  *      later in the device probing.
 557  *      Returns 0 if no settings found, 1 if they are.
 558  */
 559 int netdev_boot_setup_check(struct net_device *dev)
 560 {
 561         struct netdev_boot_setup *s = dev_boot_setup;
 562         int i;
 563
 564         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 565                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 566                     !strcmp(dev->name, s[i].name)) {
 567                         dev->irq        = s[i].map.irq;
 568                         dev->base_addr  = s[i].map.base_addr;
 569                         dev->mem_start  = s[i].map.mem_start;
 570                         dev->mem_end    = s[i].map.mem_end;
 571                         return 1;
 572                 }
 573         }
 574         return 0;
 575 }
 576 EXPORT_SYMBOL(netdev_boot_setup_check);
 577
 578
 579 /**
 580  *      netdev_boot_base        - get address from boot time settings
 581  *      @prefix: prefix for network device
 582  *      @unit: id for network device
 583  *
 584  *      Check boot time settings for the base address of device.
 585  *      The found settings are set for the device to be used
 586  *      later in the device probing.
 587  *      Returns 0 if no settings found.
 588  */
 589 unsigned long netdev_boot_base(const char *prefix, int unit)
 590 {
 591         const struct netdev_boot_setup *s = dev_boot_setup;
 592         char name[IFNAMSIZ];
 593         int i;
 594
 595         sprintf(name, "%s%d", prefix, unit);
 596
 597         /*
 598          * If device already registered then return base of 1
 599          * to indicate not to probe for this interface
 600          */
 601         if (__dev_get_by_name(&init_net, name))
 602                 return 1;
 603
 604         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 605                 if (!strcmp(name, s[i].name))
 606                         return s[i].map.base_addr;
 607         return 0;
 608 }
 609
 610 /*
 611  * Saves at boot time configured settings for any netdevice.
 612  */
 613 int __init netdev_boot_setup(char *str)
 614 {
 615         int ints[5];
 616         struct ifmap map;
 617
 618         str = get_options(str, ARRAY_SIZE(ints), ints);
 619         if (!str || !*str)
 620                 return 0;
 621
 622         /* Save settings */
 623         memset(&map, 0, sizeof(map));
 624         if (ints[0] > 0)
 625                 map.irq = ints[1];
 626         if (ints[0] > 1)
 627                 map.base_addr = ints[2];
 628         if (ints[0] > 2)
 629                 map.mem_start = ints[3];
 630         if (ints[0] > 3)
 631                 map.mem_end = ints[4];
 632
 633         /* Add new entry to the list */
 634         return netdev_boot_setup_add(str, &map);
 635 }
 636
 637 __setup("netdev=", netdev_boot_setup);
 638
 639 /*******************************************************************************
 640
 641                             Device Interface Subroutines
 642
 643 *******************************************************************************/
 644
 645 /**
 646  *      __dev_get_by_name       - find a device by its name
 647  *      @net: the applicable net namespace
 648  *      @name: name to find
 649  *
 650  *      Find an interface by name. Must be called under RTNL semaphore
 651  *      or @dev_base_lock. If the name is found a pointer to the device
 652  *      is returned. If the name is not found then %NULL is returned. The
 653  *      reference counters are not incremented so the caller must be
 654  *      careful with locks.
 655  */
 656
 657 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 658 {
 659         struct net_device *dev;
 660         struct hlist_head *head = dev_name_hash(net, name);
 661
 662         hlist_for_each_entry(dev, head, name_hlist)
 663                 if (!strncmp(dev->name, name, IFNAMSIZ))
 664                         return dev;
 665
 666         return NULL;
 667 }
 668 EXPORT_SYMBOL(__dev_get_by_name);
 669
 670 /**
 671  *      dev_get_by_name_rcu     - find a device by its name
 672  *      @net: the applicable net namespace
 673  *      @name: name to find
 674  *
 675  *      Find an interface by name.
 676  *      If the name is found a pointer to the device is returned.
 677  *      If the name is not found then %NULL is returned.
 678  *      The reference counters are not incremented so the caller must be
 679  *      careful with locks. The caller must hold RCU lock.
 680  */
 681
 682 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 683 {
 684         struct net_device *dev;
 685         struct hlist_head *head = dev_name_hash(net, name);
 686
 687         hlist_for_each_entry_rcu(dev, head, name_hlist)
 688                 if (!strncmp(dev->name, name, IFNAMSIZ))
 689                         return dev;
 690
 691         return NULL;
 692 }
 693 EXPORT_SYMBOL(dev_get_by_name_rcu);
 694
 695 /**
 696  *      dev_get_by_name         - find a device by its name
 697  *      @net: the applicable net namespace
 698  *      @name: name to find
 699  *
 700  *      Find an interface by name. This can be called from any
 701  *      context and does its own locking. The returned handle has
 702  *      the usage count incremented and the caller must use dev_put() to
 703  *      release it when it is no longer needed. %NULL is returned if no
 704  *      matching device is found.
 705  */
 706
 707 struct net_device *dev_get_by_name(struct net *net, const char *name)
 708 {
 709         struct net_device *dev;
 710
 711         rcu_read_lock();
 712         dev = dev_get_by_name_rcu(net, name);
 713         if (dev)
 714                 dev_hold(dev);
 715         rcu_read_unlock();
 716         return dev;
 717 }
 718 EXPORT_SYMBOL(dev_get_by_name);
 719
 720 /**
 721  *      __dev_get_by_index - find a device by its ifindex
 722  *      @net: the applicable net namespace
 723  *      @ifindex: index of device
 724  *
 725  *      Search for an interface by index. Returns %NULL if the device
 726  *      is not found or a pointer to the device. The device has not
 727  *      had its reference counter increased so the caller must be careful
 728  *      about locking. The caller must hold either the RTNL semaphore
 729  *      or @dev_base_lock.
 730  */
 731
 732 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 733 {
 734         struct net_device *dev;
 735         struct hlist_head *head = dev_index_hash(net, ifindex);
 736
 737         hlist_for_each_entry(dev, head, index_hlist)
 738                 if (dev->ifindex == ifindex)
 739                         return dev;
 740
 741         return NULL;
 742 }
 743 EXPORT_SYMBOL(__dev_get_by_index);
 744
 745 /**
 746  *      dev_get_by_index_rcu - find a device by its ifindex
 747  *      @net: the applicable net namespace
 748  *      @ifindex: index of device
 749  *
 750  *      Search for an interface by index. Returns %NULL if the device
 751  *      is not found or a pointer to the device. The device has not
 752  *      had its reference counter increased so the caller must be careful
 753  *      about locking. The caller must hold RCU lock.
 754  */
 755
 756 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 757 {
 758         struct net_device *dev;
 759         struct hlist_head *head = dev_index_hash(net, ifindex);
 760
 761         hlist_for_each_entry_rcu(dev, head, index_hlist)
 762                 if (dev->ifindex == ifindex)
 763                         return dev;
 764
 765         return NULL;
 766 }
 767 EXPORT_SYMBOL(dev_get_by_index_rcu);
 768
 769
 770 /**
 771  *      dev_get_by_index - find a device by its ifindex
 772  *      @net: the applicable net namespace
 773  *      @ifindex: index of device
 774  *
 775  *      Search for an interface by index. Returns NULL if the device
 776  *      is not found or a pointer to the device. The device returned has
 777  *      had a reference added and the pointer is safe until the user calls
 778  *      dev_put to indicate they have finished with it.
 779  */
 780
 781 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 782 {
 783         struct net_device *dev;
 784
 785         rcu_read_lock();
 786         dev = dev_get_by_index_rcu(net, ifindex);
 787         if (dev)
 788                 dev_hold(dev);
 789         rcu_read_unlock();
 790         return dev;
 791 }
 792 EXPORT_SYMBOL(dev_get_by_index);
 793
 794 /**
 795  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 796  *      @net: the applicable net namespace
 797  *      @type: media type of device
 798  *      @ha: hardware address
 799  *
 800  *      Search for an interface by MAC address. Returns NULL if the device
 801  *      is not found or a pointer to the device.
 802  *      The caller must hold RCU or RTNL.
 803  *      The returned device has not had its ref count increased
 804  *      and the caller must therefore be careful about locking
 805  *
 806  */
 807
 808 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 809                                        const char *ha)
 810 {
 811         struct net_device *dev;
 812
 813         for_each_netdev_rcu(net, dev)
 814                 if (dev->type == type &&
 815                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 816                         return dev;
 817
 818         return NULL;
 819 }
 820 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 821
 822 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 823 {
 824         struct net_device *dev;
 825
 826         ASSERT_RTNL();
 827         for_each_netdev(net, dev)
 828                 if (dev->type == type)
 829                         return dev;
 830
 831         return NULL;
 832 }
 833 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 834
 835 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 836 {
 837         struct net_device *dev, *ret = NULL;
 838
 839         rcu_read_lock();
 840         for_each_netdev_rcu(net, dev)
 841                 if (dev->type == type) {
 842                         dev_hold(dev);
 843                         ret = dev;
 844                         break;
 845                 }
 846         rcu_read_unlock();
 847         return ret;
 848 }
 849 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 850
 851 /**
 852  *      dev_get_by_flags_rcu - find any device with given flags
 853  *      @net: the applicable net namespace
 854  *      @if_flags: IFF_* values
 855  *      @mask: bitmask of bits in if_flags to check
 856  *
 857  *      Search for any interface with the given flags. Returns NULL if a device
 858  *      is not found or a pointer to the device. Must be called inside
 859  *      rcu_read_lock(), and result refcount is unchanged.
 860  */
 861
 862 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 863                                     unsigned short mask)
 864 {
 865         struct net_device *dev, *ret;
 866
 867         ret = NULL;
 868         for_each_netdev_rcu(net, dev) {
 869                 if (((dev->flags ^ if_flags) & mask) == 0) {
 870                         ret = dev;
 871                         break;
 872                 }
 873         }
 874         return ret;
 875 }
 876 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 877
 878 /**
 879  *      dev_valid_name - check if name is okay for network device
 880  *      @name: name string
 881  *
 882  *      Network device names need to be valid file names to
 883  *      to allow sysfs to work.  We also disallow any kind of
 884  *      whitespace.
 885  */
 886 bool dev_valid_name(const char *name)
 887 {
 888         if (*name == '\0')
 889                 return false;
 890         if (strlen(name) >= IFNAMSIZ)
 891                 return false;
 892         if (!strcmp(name, ".") || !strcmp(name, ".."))
 893                 return false;
 894
 895         while (*name) {
 896                 if (*name == '/' || isspace(*name))
 897                         return false;
 898                 name++;
 899         }
 900         return true;
 901 }
 902 EXPORT_SYMBOL(dev_valid_name);
 903
 904 /**
 905  *      __dev_alloc_name - allocate a name for a device
 906  *      @net: network namespace to allocate the device name in
 907  *      @name: name format string
 908  *      @buf:  scratch buffer and result name string
 909  *
 910  *      Passed a format string - eg "lt%d" it will try and find a suitable
 911  *      id. It scans list of devices to build up a free map, then chooses
 912  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 913  *      while allocating the name and adding the device in order to avoid
 914  *      duplicates.
 915  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 916  *      Returns the number of the unit assigned or a negative errno code.
 917  */
 918
 919 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 920 {
 921         int i = 0;
 922         const char *p;
 923         const int max_netdevices = 8*PAGE_SIZE;
 924         unsigned long *inuse;
 925         struct net_device *d;
 926
 927         p = strnchr(name, IFNAMSIZ-1, '%');
 928         if (p) {
 929                 /*
 930                  * Verify the string as this thing may have come from
 931                  * the user.  There must be either one "%d" and no other "%"
 932                  * characters.
 933                  */
 934                 if (p[1] != 'd' || strchr(p + 2, '%'))
 935                         return -EINVAL;
 936
 937                 /* Use one page as a bit array of possible slots */
 938                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 939                 if (!inuse)
 940                         return -ENOMEM;
 941
 942                 for_each_netdev(net, d) {
 943                         if (!sscanf(d->name, name, &i))
 944                                 continue;
 945                         if (i < 0 || i >= max_netdevices)
 946                                 continue;
 947
 948                         /*  avoid cases where sscanf is not exact inverse of printf */
 949                         snprintf(buf, IFNAMSIZ, name, i);
 950                         if (!strncmp(buf, d->name, IFNAMSIZ))
 951                                 set_bit(i, inuse);
 952                 }
 953
 954                 i = find_first_zero_bit(inuse, max_netdevices);
 955                 free_page((unsigned long) inuse);
 956         }
 957
 958         if (buf != name)
 959                 snprintf(buf, IFNAMSIZ, name, i);
 960         if (!__dev_get_by_name(net, buf))
 961                 return i;
 962
 963         /* It is possible to run out of possible slots
 964          * when the name is long and there isn't enough space left
 965          * for the digits, or if all bits are used.
 966          */
 967         return -ENFILE;
 968 }
 969
 970 /**
 971  *      dev_alloc_name - allocate a name for a device
 972  *      @dev: device
 973  *      @name: name format string
 974  *
 975  *      Passed a format string - eg "lt%d" it will try and find a suitable
 976  *      id. It scans list of devices to build up a free map, then chooses
 977  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 978  *      while allocating the name and adding the device in order to avoid
 979  *      duplicates.
 980  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 981  *      Returns the number of the unit assigned or a negative errno code.
 982  */
 983
 984 int dev_alloc_name(struct net_device *dev, const char *name)
 985 {
 986         char buf[IFNAMSIZ];
 987         struct net *net;
 988         int ret;
 989
 990         BUG_ON(!dev_net(dev));
 991         net = dev_net(dev);
 992         ret = __dev_alloc_name(net, name, buf);
 993         if (ret >= 0)
 994                 strlcpy(dev->name, buf, IFNAMSIZ);
 995         return ret;
 996 }
 997 EXPORT_SYMBOL(dev_alloc_name);
 998
 999 static int dev_alloc_name_ns(struct net *net,
1000                              struct net_device *dev,
1001                              const char *name)
1002 {
1003         char buf[IFNAMSIZ];
1004         int ret;
1005
1006         ret = __dev_alloc_name(net, name, buf);
1007         if (ret >= 0)
1008                 strlcpy(dev->name, buf, IFNAMSIZ);
1009         return ret;
1010 }
1011
1012 static int dev_get_valid_name(struct net *net,
1013                               struct net_device *dev,
1014                               const char *name)
1015 {
1016         BUG_ON(!net);
1017
1018         if (!dev_valid_name(name))
1019                 return -EINVAL;
1020
1021         if (strchr(name, '%'))
1022                 return dev_alloc_name_ns(net, dev, name);
1023         else if (__dev_get_by_name(net, name))
1024                 return -EEXIST;
1025         else if (dev->name != name)
1026                 strlcpy(dev->name, name, IFNAMSIZ);
1027
1028         return 0;
1029 }
1030
1031 /**
1032  *      dev_change_name - change name of a device
1033  *      @dev: device
1034  *      @newname: name (or format string) must be at least IFNAMSIZ
1035  *
1036  *      Change name of a device, can pass format strings "eth%d".
1037  *      for wildcarding.
1038  */
1039 int dev_change_name(struct net_device *dev, const char *newname)
1040 {
1041         char oldname[IFNAMSIZ];
1042         int err = 0;
1043         int ret;
1044         struct net *net;
1045
1046         ASSERT_RTNL();
1047         BUG_ON(!dev_net(dev));
1048
1049         net = dev_net(dev);
1050         if (dev->flags & IFF_UP)
1051                 return -EBUSY;
1052
1053         write_seqcount_begin(&devnet_rename_seq);
1054
1055         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1056                 write_seqcount_end(&devnet_rename_seq);
1057                 return 0;
1058         }
1059
1060         memcpy(oldname, dev->name, IFNAMSIZ);
1061
1062         err = dev_get_valid_name(net, dev, newname);
1063         if (err < 0) {
1064                 write_seqcount_end(&devnet_rename_seq);
1065                 return err;
1066         }
1067
1068 rollback:
1069         ret = device_rename(&dev->dev, dev->name);
1070         if (ret) {
1071                 memcpy(dev->name, oldname, IFNAMSIZ);
1072                 write_seqcount_end(&devnet_rename_seq);
1073                 return ret;
1074         }
1075
1076         write_seqcount_end(&devnet_rename_seq);
1077
1078         write_lock_bh(&dev_base_lock);
1079         hlist_del_rcu(&dev->name_hlist);
1080         write_unlock_bh(&dev_base_lock);
1081
1082         synchronize_rcu();
1083
1084         write_lock_bh(&dev_base_lock);
1085         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1086         write_unlock_bh(&dev_base_lock);
1087
1088         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1089         ret = notifier_to_errno(ret);
1090
1091         if (ret) {
1092                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1093                 if (err >= 0) {
1094                         err = ret;
1095                         write_seqcount_begin(&devnet_rename_seq);
1096                         memcpy(dev->name, oldname, IFNAMSIZ);
1097                         goto rollback;
1098                 } else {
1099                         pr_err("%s: name change rollback failed: %d\n",
1100                                dev->name, ret);
1101                 }
1102         }
1103
1104         return err;
1105 }
1106
1107 /**
1108  *      dev_set_alias - change ifalias of a device
1109  *      @dev: device
1110  *      @alias: name up to IFALIASZ
1111  *      @len: limit of bytes to copy from info
1112  *
1113  *      Set ifalias for a device,
1114  */
1115 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1116 {
1117         char *new_ifalias;
1118
1119         ASSERT_RTNL();
1120
1121         if (len >= IFALIASZ)
1122                 return -EINVAL;
1123
1124         if (!len) {
1125                 kfree(dev->ifalias);
1126                 dev->ifalias = NULL;
1127                 return 0;
1128         }
1129
1130         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1131         if (!new_ifalias)
1132                 return -ENOMEM;
1133         dev->ifalias = new_ifalias;
1134
1135         strlcpy(dev->ifalias, alias, len+1);
1136         return len;
1137 }
1138
1139
1140 /**
1141  *      netdev_features_change - device changes features
1142  *      @dev: device to cause notification
1143  *
1144  *      Called to indicate a device has changed features.
1145  */
1146 void netdev_features_change(struct net_device *dev)
1147 {
1148         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1149 }
1150 EXPORT_SYMBOL(netdev_features_change);
1151
1152 /**
1153  *      netdev_state_change - device changes state
1154  *      @dev: device to cause notification
1155  *
1156  *      Called to indicate a device has changed state. This function calls
1157  *      the notifier chains for netdev_chain and sends a NEWLINK message
1158  *      to the routing socket.
1159  */
1160 void netdev_state_change(struct net_device *dev)
1161 {
1162         if (dev->flags & IFF_UP) {
1163                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1164                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1165         }
1166 }
1167 EXPORT_SYMBOL(netdev_state_change);
1168
1169 /**
1170  *      netdev_notify_peers - notify network peers about existence of @dev
1171  *      @dev: network device
1172  *
1173  * Generate traffic such that interested network peers are aware of
1174  * @dev, such as by generating a gratuitous ARP. This may be used when
1175  * a device wants to inform the rest of the network about some sort of
1176  * reconfiguration such as a failover event or virtual machine
1177  * migration.
1178  */
1179 void netdev_notify_peers(struct net_device *dev)
1180 {
1181         rtnl_lock();
1182         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1183         rtnl_unlock();
1184 }
1185 EXPORT_SYMBOL(netdev_notify_peers);
1186
1187 static int __dev_open(struct net_device *dev)
1188 {
1189         const struct net_device_ops *ops = dev->netdev_ops;
1190         int ret;
1191
1192         ASSERT_RTNL();
1193
1194         if (!netif_device_present(dev))
1195                 return -ENODEV;
1196
1197         /* Block netpoll from trying to do any rx path servicing.
1198          * If we don't do this there is a chance ndo_poll_controller
1199          * or ndo_poll may be running while we open the device
1200          */
1201         ret = netpoll_rx_disable(dev);
1202         if (ret)
1203                 return ret;
1204
1205         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1206         ret = notifier_to_errno(ret);
1207         if (ret)
1208                 return ret;
1209
1210         set_bit(__LINK_STATE_START, &dev->state);
1211
1212         if (ops->ndo_validate_addr)
1213                 ret = ops->ndo_validate_addr(dev);
1214
1215         if (!ret && ops->ndo_open)
1216                 ret = ops->ndo_open(dev);
1217
1218         netpoll_rx_enable(dev);
1219
1220         if (ret)
1221                 clear_bit(__LINK_STATE_START, &dev->state);
1222         else {
1223                 dev->flags |= IFF_UP;
1224                 net_dmaengine_get();
1225                 dev_set_rx_mode(dev);
1226                 dev_activate(dev);
1227                 add_device_randomness(dev->dev_addr, dev->addr_len);
1228         }
1229
1230         return ret;
1231 }
1232
1233 /**
1234  *      dev_open        - prepare an interface for use.
1235  *      @dev:   device to open
1236  *
1237  *      Takes a device from down to up state. The device's private open
1238  *      function is invoked and then the multicast lists are loaded. Finally
1239  *      the device is moved into the up state and a %NETDEV_UP message is
1240  *      sent to the netdev notifier chain.
1241  *
1242  *      Calling this function on an active interface is a nop. On a failure
1243  *      a negative errno code is returned.
1244  */
1245 int dev_open(struct net_device *dev)
1246 {
1247         int ret;
1248
1249         if (dev->flags & IFF_UP)
1250                 return 0;
1251
1252         ret = __dev_open(dev);
1253         if (ret < 0)
1254                 return ret;
1255
1256         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1257         call_netdevice_notifiers(NETDEV_UP, dev);
1258
1259         return ret;
1260 }
1261 EXPORT_SYMBOL(dev_open);
1262
1263 static int __dev_close_many(struct list_head *head)
1264 {
1265         struct net_device *dev;
1266
1267         ASSERT_RTNL();
1268         might_sleep();
1269
1270         list_for_each_entry(dev, head, unreg_list) {
1271                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1272
1273                 clear_bit(__LINK_STATE_START, &dev->state);
1274
1275                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1276                  * can be even on different cpu. So just clear netif_running().
1277                  *
1278                  * dev->stop() will invoke napi_disable() on all of it's
1279                  * napi_struct instances on this device.
1280                  */
1281                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1282         }
1283
1284         dev_deactivate_many(head);
1285
1286         list_for_each_entry(dev, head, unreg_list) {
1287                 const struct net_device_ops *ops = dev->netdev_ops;
1288
1289                 /*
1290                  *      Call the device specific close. This cannot fail.
1291                  *      Only if device is UP
1292                  *
1293                  *      We allow it to be called even after a DETACH hot-plug
1294                  *      event.
1295                  */
1296                 if (ops->ndo_stop)
1297                         ops->ndo_stop(dev);
1298
1299                 dev->flags &= ~IFF_UP;
1300                 net_dmaengine_put();
1301         }
1302
1303         return 0;
1304 }
1305
1306 static int __dev_close(struct net_device *dev)
1307 {
1308         int retval;
1309         LIST_HEAD(single);
1310
1311         /* Temporarily disable netpoll until the interface is down */
1312         retval = netpoll_rx_disable(dev);
1313         if (retval)
1314                 return retval;
1315
1316         list_add(&dev->unreg_list, &single);
1317         retval = __dev_close_many(&single);
1318         list_del(&single);
1319
1320         netpoll_rx_enable(dev);
1321         return retval;
1322 }
1323
1324 static int dev_close_many(struct list_head *head)
1325 {
1326         struct net_device *dev, *tmp;
1327         LIST_HEAD(tmp_list);
1328
1329         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1330                 if (!(dev->flags & IFF_UP))
1331                         list_move(&dev->unreg_list, &tmp_list);
1332
1333         __dev_close_many(head);
1334
1335         list_for_each_entry(dev, head, unreg_list) {
1336                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1337                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1338         }
1339
1340         /* rollback_registered_many needs the complete original list */
1341         list_splice(&tmp_list, head);
1342         return 0;
1343 }
1344
1345 /**
1346  *      dev_close - shutdown an interface.
1347  *      @dev: device to shutdown
1348  *
1349  *      This function moves an active device into down state. A
1350  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1351  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1352  *      chain.
1353  */
1354 int dev_close(struct net_device *dev)
1355 {
1356         int ret = 0;
1357         if (dev->flags & IFF_UP) {
1358                 LIST_HEAD(single);
1359
1360                 /* Block netpoll rx while the interface is going down */
1361                 ret = netpoll_rx_disable(dev);
1362                 if (ret)
1363                         return ret;
1364
1365                 list_add(&dev->unreg_list, &single);
1366                 dev_close_many(&single);
1367                 list_del(&single);
1368
1369                 netpoll_rx_enable(dev);
1370         }
1371         return ret;
1372 }
1373 EXPORT_SYMBOL(dev_close);
1374
1375
1376 /**
1377  *      dev_disable_lro - disable Large Receive Offload on a device
1378  *      @dev: device
1379  *
1380  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1381  *      called under RTNL.  This is needed if received packets may be
1382  *      forwarded to another interface.
1383  */
1384 void dev_disable_lro(struct net_device *dev)
1385 {
1386         /*
1387          * If we're trying to disable lro on a vlan device
1388          * use the underlying physical device instead
1389          */
1390         if (is_vlan_dev(dev))
1391                 dev = vlan_dev_real_dev(dev);
1392
1393         dev->wanted_features &= ~NETIF_F_LRO;
1394         netdev_update_features(dev);
1395
1396         if (unlikely(dev->features & NETIF_F_LRO))
1397                 netdev_WARN(dev, "failed to disable LRO!\n");
1398 }
1399 EXPORT_SYMBOL(dev_disable_lro);
1400
1401
1402 static int dev_boot_phase = 1;
1403
1404 /**
1405  *      register_netdevice_notifier - register a network notifier block
1406  *      @nb: notifier
1407  *
1408  *      Register a notifier to be called when network device events occur.
1409  *      The notifier passed is linked into the kernel structures and must
1410  *      not be reused until it has been unregistered. A negative errno code
1411  *      is returned on a failure.
1412  *
1413  *      When registered all registration and up events are replayed
1414  *      to the new notifier to allow device to have a race free
1415  *      view of the network device list.
1416  */
1417
1418 int register_netdevice_notifier(struct notifier_block *nb)
1419 {
1420         struct net_device *dev;
1421         struct net_device *last;
1422         struct net *net;
1423         int err;
1424
1425         rtnl_lock();
1426         err = raw_notifier_chain_register(&netdev_chain, nb);
1427         if (err)
1428                 goto unlock;
1429         if (dev_boot_phase)
1430                 goto unlock;
1431         for_each_net(net) {
1432                 for_each_netdev(net, dev) {
1433                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1434                         err = notifier_to_errno(err);
1435                         if (err)
1436                                 goto rollback;
1437
1438                         if (!(dev->flags & IFF_UP))
1439                                 continue;
1440
1441                         nb->notifier_call(nb, NETDEV_UP, dev);
1442                 }
1443         }
1444
1445 unlock:
1446         rtnl_unlock();
1447         return err;
1448
1449 rollback:
1450         last = dev;
1451         for_each_net(net) {
1452                 for_each_netdev(net, dev) {
1453                         if (dev == last)
1454                                 goto outroll;
1455
1456                         if (dev->flags & IFF_UP) {
1457                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1458                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1459                         }
1460                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1461                 }
1462         }
1463
1464 outroll:
1465         raw_notifier_chain_unregister(&netdev_chain, nb);
1466         goto unlock;
1467 }
1468 EXPORT_SYMBOL(register_netdevice_notifier);
1469
1470 /**
1471  *      unregister_netdevice_notifier - unregister a network notifier block
1472  *      @nb: notifier
1473  *
1474  *      Unregister a notifier previously registered by
1475  *      register_netdevice_notifier(). The notifier is unlinked into the
1476  *      kernel structures and may then be reused. A negative errno code
1477  *      is returned on a failure.
1478  *
1479  *      After unregistering unregister and down device events are synthesized
1480  *      for all devices on the device list to the removed notifier to remove
1481  *      the need for special case cleanup code.
1482  */
1483
1484 int unregister_netdevice_notifier(struct notifier_block *nb)
1485 {
1486         struct net_device *dev;
1487         struct net *net;
1488         int err;
1489
1490         rtnl_lock();
1491         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1492         if (err)
1493                 goto unlock;
1494
1495         for_each_net(net) {
1496                 for_each_netdev(net, dev) {
1497                         if (dev->flags & IFF_UP) {
1498                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1499                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1500                         }
1501                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1502                 }
1503         }
1504 unlock:
1505         rtnl_unlock();
1506         return err;
1507 }
1508 EXPORT_SYMBOL(unregister_netdevice_notifier);
1509
1510 /**
1511  *      call_netdevice_notifiers - call all network notifier blocks
1512  *      @val: value passed unmodified to notifier function
1513  *      @dev: net_device pointer passed unmodified to notifier function
1514  *
1515  *      Call all network notifier blocks.  Parameters and return value
1516  *      are as for raw_notifier_call_chain().
1517  */
1518
1519 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1520 {
1521         ASSERT_RTNL();
1522         return raw_notifier_call_chain(&netdev_chain, val, dev);
1523 }
1524 EXPORT_SYMBOL(call_netdevice_notifiers);
1525
1526 static struct static_key netstamp_needed __read_mostly;
1527 #ifdef HAVE_JUMP_LABEL
1528 /* We are not allowed to call static_key_slow_dec() from irq context
1529  * If net_disable_timestamp() is called from irq context, defer the
1530  * static_key_slow_dec() calls.
1531  */
1532 static atomic_t netstamp_needed_deferred;
1533 #endif
1534
1535 void net_enable_timestamp(void)
1536 {
1537 #ifdef HAVE_JUMP_LABEL
1538         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1539
1540         if (deferred) {
1541                 while (--deferred)
1542                         static_key_slow_dec(&netstamp_needed);
1543                 return;
1544         }
1545 #endif
1546         static_key_slow_inc(&netstamp_needed);
1547 }
1548 EXPORT_SYMBOL(net_enable_timestamp);
1549
1550 void net_disable_timestamp(void)
1551 {
1552 #ifdef HAVE_JUMP_LABEL
1553         if (in_interrupt()) {
1554                 atomic_inc(&netstamp_needed_deferred);
1555                 return;
1556         }
1557 #endif
1558         static_key_slow_dec(&netstamp_needed);
1559 }
1560 EXPORT_SYMBOL(net_disable_timestamp);
1561
1562 static inline void net_timestamp_set(struct sk_buff *skb)
1563 {
1564         skb->tstamp.tv64 = 0;
1565         if (static_key_false(&netstamp_needed))
1566                 __net_timestamp(skb);
1567 }
1568
1569 #define net_timestamp_check(COND, SKB)                  \
1570         if (static_key_false(&netstamp_needed)) {               \
1571                 if ((COND) && !(SKB)->tstamp.tv64)      \
1572                         __net_timestamp(SKB);           \
1573         }                                               \
1574
1575 static inline bool is_skb_forwardable(struct net_device *dev,
1576                                       struct sk_buff *skb)
1577 {
1578         unsigned int len;
1579
1580         if (!(dev->flags & IFF_UP))
1581                 return false;
1582
1583         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1584         if (skb->len <= len)
1585                 return true;
1586
1587         /* if TSO is enabled, we don't care about the length as the packet
1588          * could be forwarded without being segmented before
1589          */
1590         if (skb_is_gso(skb))
1591                 return true;
1592
1593         return false;
1594 }
1595
1596 /**
1597  * dev_forward_skb - loopback an skb to another netif
1598  *
1599  * @dev: destination network device
1600  * @skb: buffer to forward
1601  *
1602  * return values:
1603  *      NET_RX_SUCCESS  (no congestion)
1604  *      NET_RX_DROP     (packet was dropped, but freed)
1605  *
1606  * dev_forward_skb can be used for injecting an skb from the
1607  * start_xmit function of one device into the receive queue
1608  * of another device.
1609  *
1610  * The receiving device may be in another namespace, so
1611  * we have to clear all information in the skb that could
1612  * impact namespace isolation.
1613  */
1614 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1615 {
1616         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1617                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1618                         atomic_long_inc(&dev->rx_dropped);
1619                         kfree_skb(skb);
1620                         return NET_RX_DROP;
1621                 }
1622         }
1623
1624         skb_orphan(skb);
1625
1626         if (unlikely(!is_skb_forwardable(dev, skb))) {
1627                 atomic_long_inc(&dev->rx_dropped);
1628                 kfree_skb(skb);
1629                 return NET_RX_DROP;
1630         }
1631         skb->skb_iif = 0;
1632         skb->dev = dev;
1633         skb_dst_drop(skb);
1634         skb->tstamp.tv64 = 0;
1635         skb->pkt_type = PACKET_HOST;
1636         skb->protocol = eth_type_trans(skb, dev);
1637         skb->mark = 0;
1638         secpath_reset(skb);
1639         nf_reset(skb);
1640         nf_reset_trace(skb);
1641         return netif_rx(skb);
1642 }
1643 EXPORT_SYMBOL_GPL(dev_forward_skb);
1644
1645 static inline int deliver_skb(struct sk_buff *skb,
1646                               struct packet_type *pt_prev,
1647                               struct net_device *orig_dev)
1648 {
1649         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1650                 return -ENOMEM;
1651         atomic_inc(&skb->users);
1652         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1653 }
1654
1655 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1656 {
1657         if (!ptype->af_packet_priv || !skb->sk)
1658                 return false;
1659
1660         if (ptype->id_match)
1661                 return ptype->id_match(ptype, skb->sk);
1662         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1663                 return true;
1664
1665         return false;
1666 }
1667
1668 /*
1669  *      Support routine. Sends outgoing frames to any network
1670  *      taps currently in use.
1671  */
1672
1673 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1674 {
1675         struct packet_type *ptype;
1676         struct sk_buff *skb2 = NULL;
1677         struct packet_type *pt_prev = NULL;
1678
1679         rcu_read_lock();
1680         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1681                 /* Never send packets back to the socket
1682                  * they originated from - MvS (miquels@drinkel.ow.org)
1683                  */
1684                 if ((ptype->dev == dev || !ptype->dev) &&
1685                     (!skb_loop_sk(ptype, skb))) {
1686                         if (pt_prev) {
1687                                 deliver_skb(skb2, pt_prev, skb->dev);
1688                                 pt_prev = ptype;
1689                                 continue;
1690                         }
1691
1692                         skb2 = skb_clone(skb, GFP_ATOMIC);
1693                         if (!skb2)
1694                                 break;
1695
1696                         net_timestamp_set(skb2);
1697
1698                         /* skb->nh should be correctly
1699                            set by sender, so that the second statement is
1700                            just protection against buggy protocols.
1701                          */
1702                         skb_reset_mac_header(skb2);
1703
1704                         if (skb_network_header(skb2) < skb2->data ||
1705                             skb2->network_header > skb2->tail) {
1706                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1707                                                      ntohs(skb2->protocol),
1708                                                      dev->name);
1709                                 skb_reset_network_header(skb2);
1710                         }
1711
1712                         skb2->transport_header = skb2->network_header;
1713                         skb2->pkt_type = PACKET_OUTGOING;
1714                         pt_prev = ptype;
1715                 }
1716         }
1717         if (pt_prev)
1718                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1719         rcu_read_unlock();
1720 }
1721
1722 /**
1723  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1724  * @dev: Network device
1725  * @txq: number of queues available
1726  *
1727  * If real_num_tx_queues is changed the tc mappings may no longer be
1728  * valid. To resolve this verify the tc mapping remains valid and if
1729  * not NULL the mapping. With no priorities mapping to this
1730  * offset/count pair it will no longer be used. In the worst case TC0
1731  * is invalid nothing can be done so disable priority mappings. If is
1732  * expected that drivers will fix this mapping if they can before
1733  * calling netif_set_real_num_tx_queues.
1734  */
1735 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1736 {
1737         int i;
1738         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1739
1740         /* If TC0 is invalidated disable TC mapping */
1741         if (tc->offset + tc->count > txq) {
1742                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1743                 dev->num_tc = 0;
1744                 return;
1745         }
1746
1747         /* Invalidated prio to tc mappings set to TC0 */
1748         for (i = 1; i < TC_BITMASK + 1; i++) {
1749                 int q = netdev_get_prio_tc_map(dev, i);
1750
1751                 tc = &dev->tc_to_txq[q];
1752                 if (tc->offset + tc->count > txq) {
1753                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1754                                 i, q);
1755                         netdev_set_prio_tc_map(dev, i, 0);
1756                 }
1757         }
1758 }
1759
1760 #ifdef CONFIG_XPS
1761 static DEFINE_MUTEX(xps_map_mutex);
1762 #define xmap_dereference(P)             \
1763         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1764
1765 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1766                                         int cpu, u16 index)
1767 {
1768         struct xps_map *map = NULL;
1769         int pos;
1770
1771         if (dev_maps)
1772                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1773
1774         for (pos = 0; map && pos < map->len; pos++) {
1775                 if (map->queues[pos] == index) {
1776                         if (map->len > 1) {
1777                                 map->queues[pos] = map->queues[--map->len];
1778                         } else {
1779                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1780                                 kfree_rcu(map, rcu);
1781                                 map = NULL;
1782                         }
1783                         break;
1784                 }
1785         }
1786
1787         return map;
1788 }
1789
1790 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1791 {
1792         struct xps_dev_maps *dev_maps;
1793         int cpu, i;
1794         bool active = false;
1795
1796         mutex_lock(&xps_map_mutex);
1797         dev_maps = xmap_dereference(dev->xps_maps);
1798
1799         if (!dev_maps)
1800                 goto out_no_maps;
1801
1802         for_each_possible_cpu(cpu) {
1803                 for (i = index; i < dev->num_tx_queues; i++) {
1804                         if (!remove_xps_queue(dev_maps, cpu, i))
1805                                 break;
1806                 }
1807                 if (i == dev->num_tx_queues)
1808                         active = true;
1809         }
1810
1811         if (!active) {
1812                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1813                 kfree_rcu(dev_maps, rcu);
1814         }
1815
1816         for (i = index; i < dev->num_tx_queues; i++)
1817                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1818                                              NUMA_NO_NODE);
1819
1820 out_no_maps:
1821         mutex_unlock(&xps_map_mutex);
1822 }
1823
1824 static struct xps_map *expand_xps_map(struct xps_map *map,
1825                                       int cpu, u16 index)
1826 {
1827         struct xps_map *new_map;
1828         int alloc_len = XPS_MIN_MAP_ALLOC;
1829         int i, pos;
1830
1831         for (pos = 0; map && pos < map->len; pos++) {
1832                 if (map->queues[pos] != index)
1833                         continue;
1834                 return map;
1835         }
1836
1837         /* Need to add queue to this CPU's existing map */
1838         if (map) {
1839                 if (pos < map->alloc_len)
1840                         return map;
1841
1842                 alloc_len = map->alloc_len * 2;
1843         }
1844
1845         /* Need to allocate new map to store queue on this CPU's map */
1846         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1847                                cpu_to_node(cpu));
1848         if (!new_map)
1849                 return NULL;
1850
1851         for (i = 0; i < pos; i++)
1852                 new_map->queues[i] = map->queues[i];
1853         new_map->alloc_len = alloc_len;
1854         new_map->len = pos;
1855
1856         return new_map;
1857 }
1858
1859 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1860 {
1861         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1862         struct xps_map *map, *new_map;
1863         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1864         int cpu, numa_node_id = -2;
1865         bool active = false;
1866
1867         mutex_lock(&xps_map_mutex);
1868
1869         dev_maps = xmap_dereference(dev->xps_maps);
1870
1871         /* allocate memory for queue storage */
1872         for_each_online_cpu(cpu) {
1873                 if (!cpumask_test_cpu(cpu, mask))
1874                         continue;
1875
1876                 if (!new_dev_maps)
1877                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1878                 if (!new_dev_maps) {
1879                         mutex_unlock(&xps_map_mutex);
1880                         return -ENOMEM;
1881                 }
1882
1883                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1884                                  NULL;
1885
1886                 map = expand_xps_map(map, cpu, index);
1887                 if (!map)
1888                         goto error;
1889
1890                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1891         }
1892
1893         if (!new_dev_maps)
1894                 goto out_no_new_maps;
1895
1896         for_each_possible_cpu(cpu) {
1897                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1898                         /* add queue to CPU maps */
1899                         int pos = 0;
1900
1901                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1902                         while ((pos < map->len) && (map->queues[pos] != index))
1903                                 pos++;
1904
1905                         if (pos == map->len)
1906                                 map->queues[map->len++] = index;
1907 #ifdef CONFIG_NUMA
1908                         if (numa_node_id == -2)
1909                                 numa_node_id = cpu_to_node(cpu);
1910                         else if (numa_node_id != cpu_to_node(cpu))
1911                                 numa_node_id = -1;
1912 #endif
1913                 } else if (dev_maps) {
1914                         /* fill in the new device map from the old device map */
1915                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1916                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1917                 }
1918
1919         }
1920
1921         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1922
1923         /* Cleanup old maps */
1924         if (dev_maps) {
1925                 for_each_possible_cpu(cpu) {
1926                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1927                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1928                         if (map && map != new_map)
1929                                 kfree_rcu(map, rcu);
1930                 }
1931
1932                 kfree_rcu(dev_maps, rcu);
1933         }
1934
1935         dev_maps = new_dev_maps;
1936         active = true;
1937
1938 out_no_new_maps:
1939         /* update Tx queue numa node */
1940         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1941                                      (numa_node_id >= 0) ? numa_node_id :
1942                                      NUMA_NO_NODE);
1943
1944         if (!dev_maps)
1945                 goto out_no_maps;
1946
1947         /* removes queue from unused CPUs */
1948         for_each_possible_cpu(cpu) {
1949                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1950                         continue;
1951
1952                 if (remove_xps_queue(dev_maps, cpu, index))
1953                         active = true;
1954         }
1955
1956         /* free map if not active */
1957         if (!active) {
1958                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1959                 kfree_rcu(dev_maps, rcu);
1960         }
1961
1962 out_no_maps:
1963         mutex_unlock(&xps_map_mutex);
1964
1965         return 0;
1966 error:
1967         /* remove any maps that we added */
1968         for_each_possible_cpu(cpu) {
1969                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1970                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1971                                  NULL;
1972                 if (new_map && new_map != map)
1973                         kfree(new_map);
1974         }
1975
1976         mutex_unlock(&xps_map_mutex);
1977
1978         kfree(new_dev_maps);
1979         return -ENOMEM;
1980 }
1981 EXPORT_SYMBOL(netif_set_xps_queue);
1982
1983 #endif
1984 /*
1985  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1986  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1987  */
1988 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1989 {
1990         int rc;
1991
1992         if (txq < 1 || txq > dev->num_tx_queues)
1993                 return -EINVAL;
1994
1995         if (dev->reg_state == NETREG_REGISTERED ||
1996             dev->reg_state == NETREG_UNREGISTERING) {
1997                 ASSERT_RTNL();
1998
1999                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2000                                                   txq);
2001                 if (rc)
2002                         return rc;
2003
2004                 if (dev->num_tc)
2005                         netif_setup_tc(dev, txq);
2006
2007                 if (txq < dev->real_num_tx_queues) {
2008                         qdisc_reset_all_tx_gt(dev, txq);
2009 #ifdef CONFIG_XPS
2010                         netif_reset_xps_queues_gt(dev, txq);
2011 #endif
2012                 }
2013         }
2014
2015         dev->real_num_tx_queues = txq;
2016         return 0;
2017 }
2018 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2019
2020 #ifdef CONFIG_RPS
2021 /**
2022  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2023  *      @dev: Network device
2024  *      @rxq: Actual number of RX queues
2025  *
2026  *      This must be called either with the rtnl_lock held or before
2027  *      registration of the net device.  Returns 0 on success, or a
2028  *      negative error code.  If called before registration, it always
2029  *      succeeds.
2030  */
2031 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2032 {
2033         int rc;
2034
2035         if (rxq < 1 || rxq > dev->num_rx_queues)
2036                 return -EINVAL;
2037
2038         if (dev->reg_state == NETREG_REGISTERED) {
2039                 ASSERT_RTNL();
2040
2041                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2042                                                   rxq);
2043                 if (rc)
2044                         return rc;
2045         }
2046
2047         dev->real_num_rx_queues = rxq;
2048         return 0;
2049 }
2050 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2051 #endif
2052
2053 /**
2054  * netif_get_num_default_rss_queues - default number of RSS queues
2055  *
2056  * This routine should set an upper limit on the number of RSS queues
2057  * used by default by multiqueue devices.
2058  */
2059 int netif_get_num_default_rss_queues(void)
2060 {
2061         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2062 }
2063 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2064
2065 static inline void __netif_reschedule(struct Qdisc *q)
2066 {
2067         struct softnet_data *sd;
2068         unsigned long flags;
2069
2070         local_irq_save(flags);
2071         sd = &__get_cpu_var(softnet_data);
2072         q->next_sched = NULL;
2073         *sd->output_queue_tailp = q;
2074         sd->output_queue_tailp = &q->next_sched;
2075         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2076         local_irq_restore(flags);
2077 }
2078
2079 void __netif_schedule(struct Qdisc *q)
2080 {
2081         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2082                 __netif_reschedule(q);
2083 }
2084 EXPORT_SYMBOL(__netif_schedule);
2085
2086 void dev_kfree_skb_irq(struct sk_buff *skb)
2087 {
2088         if (atomic_dec_and_test(&skb->users)) {
2089                 struct softnet_data *sd;
2090                 unsigned long flags;
2091
2092                 local_irq_save(flags);
2093                 sd = &__get_cpu_var(softnet_data);
2094                 skb->next = sd->completion_queue;
2095                 sd->completion_queue = skb;
2096                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2097                 local_irq_restore(flags);
2098         }
2099 }
2100 EXPORT_SYMBOL(dev_kfree_skb_irq);
2101
2102 void dev_kfree_skb_any(struct sk_buff *skb)
2103 {
2104         if (in_irq() || irqs_disabled())
2105                 dev_kfree_skb_irq(skb);
2106         else
2107                 dev_kfree_skb(skb);
2108 }
2109 EXPORT_SYMBOL(dev_kfree_skb_any);
2110
2111
2112 /**
2113  * netif_device_detach - mark device as removed
2114  * @dev: network device
2115  *
2116  * Mark device as removed from system and therefore no longer available.
2117  */
2118 void netif_device_detach(struct net_device *dev)
2119 {
2120         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2121             netif_running(dev)) {
2122                 netif_tx_stop_all_queues(dev);
2123         }
2124 }
2125 EXPORT_SYMBOL(netif_device_detach);
2126
2127 /**
2128  * netif_device_attach - mark device as attached
2129  * @dev: network device
2130  *
2131  * Mark device as attached from system and restart if needed.
2132  */
2133 void netif_device_attach(struct net_device *dev)
2134 {
2135         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2136             netif_running(dev)) {
2137                 netif_tx_wake_all_queues(dev);
2138                 __netdev_watchdog_up(dev);
2139         }
2140 }
2141 EXPORT_SYMBOL(netif_device_attach);
2142
2143 static void skb_warn_bad_offload(const struct sk_buff *skb)
2144 {
2145         static const netdev_features_t null_features = 0;
2146         struct net_device *dev = skb->dev;
2147         const char *driver = "";
2148
2149         if (!net_ratelimit())
2150                 return;
2151
2152         if (dev && dev->dev.parent)
2153                 driver = dev_driver_string(dev->dev.parent);
2154
2155         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2156              "gso_type=%d ip_summed=%d\n",
2157              driver, dev ? &dev->features : &null_features,
2158              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2159              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2160              skb_shinfo(skb)->gso_type, skb->ip_summed);
2161 }
2162
2163 /*
2164  * Invalidate hardware checksum when packet is to be mangled, and
2165  * complete checksum manually on outgoing path.
2166  */
2167 int skb_checksum_help(struct sk_buff *skb)
2168 {
2169         __wsum csum;
2170         int ret = 0, offset;
2171
2172         if (skb->ip_summed == CHECKSUM_COMPLETE)
2173                 goto out_set_summed;
2174
2175         if (unlikely(skb_shinfo(skb)->gso_size)) {
2176                 skb_warn_bad_offload(skb);
2177                 return -EINVAL;
2178         }
2179
2180         /* Before computing a checksum, we should make sure no frag could
2181          * be modified by an external entity : checksum could be wrong.
2182          */
2183         if (skb_has_shared_frag(skb)) {
2184                 ret = __skb_linearize(skb);
2185                 if (ret)
2186                         goto out;
2187         }
2188
2189         offset = skb_checksum_start_offset(skb);
2190         BUG_ON(offset >= skb_headlen(skb));
2191         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2192
2193         offset += skb->csum_offset;
2194         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2195
2196         if (skb_cloned(skb) &&
2197             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2198                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2199                 if (ret)
2200                         goto out;
2201         }
2202
2203         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2204 out_set_summed:
2205         skb->ip_summed = CHECKSUM_NONE;
2206 out:
2207         return ret;
2208 }
2209 EXPORT_SYMBOL(skb_checksum_help);
2210
2211 __be16 skb_network_protocol(struct sk_buff *skb)
2212 {
2213         __be16 type = skb->protocol;
2214         int vlan_depth = ETH_HLEN;
2215
2216         /* Tunnel gso handlers can set protocol to ethernet. */
2217         if (type == htons(ETH_P_TEB)) {
2218                 struct ethhdr *eth;
2219
2220                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2221                         return 0;
2222
2223                 eth = (struct ethhdr *)skb_mac_header(skb);
2224                 type = eth->h_proto;
2225         }
2226
2227         while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2228                 struct vlan_hdr *vh;
2229
2230                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2231                         return 0;
2232
2233                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2234                 type = vh->h_vlan_encapsulated_proto;
2235                 vlan_depth += VLAN_HLEN;
2236         }
2237
2238         return type;
2239 }
2240
2241 /**
2242  *      skb_mac_gso_segment - mac layer segmentation handler.
2243  *      @skb: buffer to segment
2244  *      @features: features for the output path (see dev->features)
2245  */
2246 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2247                                     netdev_features_t features)
2248 {
2249         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2250         struct packet_offload *ptype;
2251         __be16 type = skb_network_protocol(skb);
2252
2253         if (unlikely(!type))
2254                 return ERR_PTR(-EINVAL);
2255
2256         __skb_pull(skb, skb->mac_len);
2257
2258         rcu_read_lock();
2259         list_for_each_entry_rcu(ptype, &offload_base, list) {
2260                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2261                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2262                                 int err;
2263
2264                                 err = ptype->callbacks.gso_send_check(skb);
2265                                 segs = ERR_PTR(err);
2266                                 if (err || skb_gso_ok(skb, features))
2267                                         break;
2268                                 __skb_push(skb, (skb->data -
2269                                                  skb_network_header(skb)));
2270                         }
2271                         segs = ptype->callbacks.gso_segment(skb, features);
2272                         break;
2273                 }
2274         }
2275         rcu_read_unlock();
2276
2277         __skb_push(skb, skb->data - skb_mac_header(skb));
2278
2279         return segs;
2280 }
2281 EXPORT_SYMBOL(skb_mac_gso_segment);
2282
2283
2284 /* openvswitch calls this on rx path, so we need a different check.
2285  */
2286 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2287 {
2288         if (tx_path)
2289                 return skb->ip_summed != CHECKSUM_PARTIAL;
2290         else
2291                 return skb->ip_summed == CHECKSUM_NONE;
2292 }
2293
2294 /**
2295  *      __skb_gso_segment - Perform segmentation on skb.
2296  *      @skb: buffer to segment
2297  *      @features: features for the output path (see dev->features)
2298  *      @tx_path: whether it is called in TX path
2299  *
2300  *      This function segments the given skb and returns a list of segments.
2301  *
2302  *      It may return NULL if the skb requires no segmentation.  This is
2303  *      only possible when GSO is used for verifying header integrity.
2304  */
2305 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2306                                   netdev_features_t features, bool tx_path)
2307 {
2308         if (unlikely(skb_needs_check(skb, tx_path))) {
2309                 int err;
2310
2311                 skb_warn_bad_offload(skb);
2312
2313                 if (skb_header_cloned(skb) &&
2314                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2315                         return ERR_PTR(err);
2316         }
2317
2318         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2319         skb_reset_mac_header(skb);
2320         skb_reset_mac_len(skb);
2321
2322         return skb_mac_gso_segment(skb, features);
2323 }
2324 EXPORT_SYMBOL(__skb_gso_segment);
2325
2326 /* Take action when hardware reception checksum errors are detected. */
2327 #ifdef CONFIG_BUG
2328 void netdev_rx_csum_fault(struct net_device *dev)
2329 {
2330         if (net_ratelimit()) {
2331                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2332                 dump_stack();
2333         }
2334 }
2335 EXPORT_SYMBOL(netdev_rx_csum_fault);
2336 #endif
2337
2338 /* Actually, we should eliminate this check as soon as we know, that:
2339  * 1. IOMMU is present and allows to map all the memory.
2340  * 2. No high memory really exists on this machine.
2341  */
2342
2343 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2344 {
2345 #ifdef CONFIG_HIGHMEM
2346         int i;
2347         if (!(dev->features & NETIF_F_HIGHDMA)) {
2348                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2349                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2350                         if (PageHighMem(skb_frag_page(frag)))
2351                                 return 1;
2352                 }
2353         }
2354
2355         if (PCI_DMA_BUS_IS_PHYS) {
2356                 struct device *pdev = dev->dev.parent;
2357
2358                 if (!pdev)
2359                         return 0;
2360                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2361                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2362                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2363                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2364                                 return 1;
2365                 }
2366         }
2367 #endif
2368         return 0;
2369 }
2370
2371 struct dev_gso_cb {
2372         void (*destructor)(struct sk_buff *skb);
2373 };
2374
2375 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2376
2377 static void dev_gso_skb_destructor(struct sk_buff *skb)
2378 {
2379         struct dev_gso_cb *cb;
2380
2381         do {
2382                 struct sk_buff *nskb = skb->next;
2383
2384                 skb->next = nskb->next;
2385                 nskb->next = NULL;
2386                 kfree_skb(nskb);
2387         } while (skb->next);
2388
2389         cb = DEV_GSO_CB(skb);
2390         if (cb->destructor)
2391                 cb->destructor(skb);
2392 }
2393
2394 /**
2395  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2396  *      @skb: buffer to segment
2397  *      @features: device features as applicable to this skb
2398  *
2399  *      This function segments the given skb and stores the list of segments
2400  *      in skb->next.
2401  */
2402 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2403 {
2404         struct sk_buff *segs;
2405
2406         segs = skb_gso_segment(skb, features);
2407
2408         /* Verifying header integrity only. */
2409         if (!segs)
2410                 return 0;
2411
2412         if (IS_ERR(segs))
2413                 return PTR_ERR(segs);
2414
2415         skb->next = segs;
2416         DEV_GSO_CB(skb)->destructor = skb->destructor;
2417         skb->destructor = dev_gso_skb_destructor;
2418
2419         return 0;
2420 }
2421
2422 static netdev_features_t harmonize_features(struct sk_buff *skb,
2423         __be16 protocol, netdev_features_t features)
2424 {
2425         if (skb->ip_summed != CHECKSUM_NONE &&
2426             !can_checksum_protocol(features, protocol)) {
2427                 features &= ~NETIF_F_ALL_CSUM;
2428         } else if (illegal_highdma(skb->dev, skb)) {
2429                 features &= ~NETIF_F_SG;
2430         }
2431
2432         return features;
2433 }
2434
2435 netdev_features_t netif_skb_features(struct sk_buff *skb)
2436 {
2437         __be16 protocol = skb->protocol;
2438         netdev_features_t features = skb->dev->features;
2439
2440         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2441                 features &= ~NETIF_F_GSO_MASK;
2442
2443         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2444                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2445                 protocol = veh->h_vlan_encapsulated_proto;
2446         } else if (!vlan_tx_tag_present(skb)) {
2447                 return harmonize_features(skb, protocol, features);
2448         }
2449
2450         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2451                                                NETIF_F_HW_VLAN_STAG_TX);
2452
2453         if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
2454                 return harmonize_features(skb, protocol, features);
2455         } else {
2456                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2457                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2458                                 NETIF_F_HW_VLAN_STAG_TX;
2459                 return harmonize_features(skb, protocol, features);
2460         }
2461 }
2462 EXPORT_SYMBOL(netif_skb_features);
2463
2464 /*
2465  * Returns true if either:
2466  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2467  *      2. skb is fragmented and the device does not support SG.
2468  */
2469 static inline int skb_needs_linearize(struct sk_buff *skb,
2470                                       netdev_features_t features)
2471 {
2472         return skb_is_nonlinear(skb) &&
2473                         ((skb_has_frag_list(skb) &&
2474                                 !(features & NETIF_F_FRAGLIST)) ||
2475                         (skb_shinfo(skb)->nr_frags &&
2476                                 !(features & NETIF_F_SG)));
2477 }
2478
2479 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2480                         struct netdev_queue *txq)
2481 {
2482         const struct net_device_ops *ops = dev->netdev_ops;
2483         int rc = NETDEV_TX_OK;
2484         unsigned int skb_len;
2485
2486         if (likely(!skb->next)) {
2487                 netdev_features_t features;
2488
2489                 /*
2490                  * If device doesn't need skb->dst, release it right now while
2491                  * its hot in this cpu cache
2492                  */
2493                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2494                         skb_dst_drop(skb);
2495
2496                 features = netif_skb_features(skb);
2497
2498                 if (vlan_tx_tag_present(skb) &&
2499                     !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2500                         skb = __vlan_put_tag(skb, skb->vlan_proto,
2501                                              vlan_tx_tag_get(skb));
2502                         if (unlikely(!skb))
2503                                 goto out;
2504
2505                         skb->vlan_tci = 0;
2506                 }
2507
2508                 /* If encapsulation offload request, verify we are testing
2509                  * hardware encapsulation features instead of standard
2510                  * features for the netdev
2511                  */
2512                 if (skb->encapsulation)
2513                         features &= dev->hw_enc_features;
2514
2515                 if (netif_needs_gso(skb, features)) {
2516                         if (unlikely(dev_gso_segment(skb, features)))
2517                                 goto out_kfree_skb;
2518                         if (skb->next)
2519                                 goto gso;
2520                 } else {
2521                         if (skb_needs_linearize(skb, features) &&
2522                             __skb_linearize(skb))
2523                                 goto out_kfree_skb;
2524
2525                         /* If packet is not checksummed and device does not
2526                          * support checksumming for this protocol, complete
2527                          * checksumming here.
2528                          */
2529                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2530                                 if (skb->encapsulation)
2531                                         skb_set_inner_transport_header(skb,
2532                                                 skb_checksum_start_offset(skb));
2533                                 else
2534                                         skb_set_transport_header(skb,
2535                                                 skb_checksum_start_offset(skb));
2536                                 if (!(features & NETIF_F_ALL_CSUM) &&
2537                                      skb_checksum_help(skb))
2538                                         goto out_kfree_skb;
2539                         }
2540                 }
2541
2542                 if (!list_empty(&ptype_all))
2543                         dev_queue_xmit_nit(skb, dev);
2544
2545                 skb_len = skb->len;
2546                 rc = ops->ndo_start_xmit(skb, dev);
2547                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2548                 if (rc == NETDEV_TX_OK)
2549                         txq_trans_update(txq);
2550                 return rc;
2551         }
2552
2553 gso:
2554         do {
2555                 struct sk_buff *nskb = skb->next;
2556
2557                 skb->next = nskb->next;
2558                 nskb->next = NULL;
2559
2560                 if (!list_empty(&ptype_all))
2561                         dev_queue_xmit_nit(nskb, dev);
2562
2563                 skb_len = nskb->len;
2564                 rc = ops->ndo_start_xmit(nskb, dev);
2565                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2566                 if (unlikely(rc != NETDEV_TX_OK)) {
2567                         if (rc & ~NETDEV_TX_MASK)
2568                                 goto out_kfree_gso_skb;
2569                         nskb->next = skb->next;
2570                         skb->next = nskb;
2571                         return rc;
2572                 }
2573                 txq_trans_update(txq);
2574                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2575                         return NETDEV_TX_BUSY;
2576         } while (skb->next);
2577
2578 out_kfree_gso_skb:
2579         if (likely(skb->next == NULL)) {
2580                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2581                 consume_skb(skb);
2582                 return rc;
2583         }
2584 out_kfree_skb:
2585         kfree_skb(skb);
2586 out:
2587         return rc;
2588 }
2589
2590 static void qdisc_pkt_len_init(struct sk_buff *skb)
2591 {
2592         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2593
2594         qdisc_skb_cb(skb)->pkt_len = skb->len;
2595
2596         /* To get more precise estimation of bytes sent on wire,
2597          * we add to pkt_len the headers size of all segments
2598          */
2599         if (shinfo->gso_size)  {
2600                 unsigned int hdr_len;
2601                 u16 gso_segs = shinfo->gso_segs;
2602
2603                 /* mac layer + network layer */
2604                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2605
2606                 /* + transport layer */
2607                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2608                         hdr_len += tcp_hdrlen(skb);
2609                 else
2610                         hdr_len += sizeof(struct udphdr);
2611
2612                 if (shinfo->gso_type & SKB_GSO_DODGY)
2613                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2614                                                 shinfo->gso_size);
2615
2616                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2617         }
2618 }
2619
2620 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2621                                  struct net_device *dev,
2622                                  struct netdev_queue *txq)
2623 {
2624         spinlock_t *root_lock = qdisc_lock(q);
2625         bool contended;
2626         int rc;
2627
2628         qdisc_pkt_len_init(skb);
2629         qdisc_calculate_pkt_len(skb, q);
2630         /*
2631          * Heuristic to force contended enqueues to serialize on a
2632          * separate lock before trying to get qdisc main lock.
2633          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2634          * and dequeue packets faster.
2635          */
2636         contended = qdisc_is_running(q);
2637         if (unlikely(contended))
2638                 spin_lock(&q->busylock);
2639
2640         spin_lock(root_lock);
2641         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2642                 kfree_skb(skb);
2643                 rc = NET_XMIT_DROP;
2644         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2645                    qdisc_run_begin(q)) {
2646                 /*
2647                  * This is a work-conserving queue; there are no old skbs
2648                  * waiting to be sent out; and the qdisc is not running -
2649                  * xmit the skb directly.
2650                  */
2651                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2652                         skb_dst_force(skb);
2653
2654                 qdisc_bstats_update(q, skb);
2655
2656                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2657                         if (unlikely(contended)) {
2658                                 spin_unlock(&q->busylock);
2659                                 contended = false;
2660                         }
2661                         __qdisc_run(q);
2662                 } else
2663                         qdisc_run_end(q);
2664
2665                 rc = NET_XMIT_SUCCESS;
2666         } else {
2667                 skb_dst_force(skb);
2668                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2669                 if (qdisc_run_begin(q)) {
2670                         if (unlikely(contended)) {
2671                                 spin_unlock(&q->busylock);
2672                                 contended = false;
2673                         }
2674                         __qdisc_run(q);
2675                 }
2676         }
2677         spin_unlock(root_lock);
2678         if (unlikely(contended))
2679                 spin_unlock(&q->busylock);
2680         return rc;
2681 }
2682
2683 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2684 static void skb_update_prio(struct sk_buff *skb)
2685 {
2686         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2687
2688         if (!skb->priority && skb->sk && map) {
2689                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2690
2691                 if (prioidx < map->priomap_len)
2692                         skb->priority = map->priomap[prioidx];
2693         }
2694 }
2695 #else
2696 #define skb_update_prio(skb)
2697 #endif
2698
2699 static DEFINE_PER_CPU(int, xmit_recursion);
2700 #define RECURSION_LIMIT 10
2701
2702 /**
2703  *      dev_loopback_xmit - loop back @skb
2704  *      @skb: buffer to transmit
2705  */
2706 int dev_loopback_xmit(struct sk_buff *skb)
2707 {
2708         skb_reset_mac_header(skb);
2709         __skb_pull(skb, skb_network_offset(skb));
2710         skb->pkt_type = PACKET_LOOPBACK;
2711         skb->ip_summed = CHECKSUM_UNNECESSARY;
2712         WARN_ON(!skb_dst(skb));
2713         skb_dst_force(skb);
2714         netif_rx_ni(skb);
2715         return 0;
2716 }
2717 EXPORT_SYMBOL(dev_loopback_xmit);
2718
2719 /**
2720  *      dev_queue_xmit - transmit a buffer
2721  *      @skb: buffer to transmit
2722  *
2723  *      Queue a buffer for transmission to a network device. The caller must
2724  *      have set the device and priority and built the buffer before calling
2725  *      this function. The function can be called from an interrupt.
2726  *
2727  *      A negative errno code is returned on a failure. A success does not
2728  *      guarantee the frame will be transmitted as it may be dropped due
2729  *      to congestion or traffic shaping.
2730  *
2731  * -----------------------------------------------------------------------------------
2732  *      I notice this method can also return errors from the queue disciplines,
2733  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2734  *      be positive.
2735  *
2736  *      Regardless of the return value, the skb is consumed, so it is currently
2737  *      difficult to retry a send to this method.  (You can bump the ref count
2738  *      before sending to hold a reference for retry if you are careful.)
2739  *
2740  *      When calling this method, interrupts MUST be enabled.  This is because
2741  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2742  *          --BLG
2743  */
2744 int dev_queue_xmit(struct sk_buff *skb)
2745 {
2746         struct net_device *dev = skb->dev;
2747         struct netdev_queue *txq;
2748         struct Qdisc *q;
2749         int rc = -ENOMEM;
2750
2751         skb_reset_mac_header(skb);
2752
2753         /* Disable soft irqs for various locks below. Also
2754          * stops preemption for RCU.
2755          */
2756         rcu_read_lock_bh();
2757
2758         skb_update_prio(skb);
2759
2760         txq = netdev_pick_tx(dev, skb);
2761         q = rcu_dereference_bh(txq->qdisc);
2762
2763 #ifdef CONFIG_NET_CLS_ACT
2764         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2765 #endif
2766         trace_net_dev_queue(skb);
2767         if (q->enqueue) {
2768                 rc = __dev_xmit_skb(skb, q, dev, txq);
2769                 goto out;
2770         }
2771
2772         /* The device has no queue. Common case for software devices:
2773            loopback, all the sorts of tunnels...
2774
2775            Really, it is unlikely that netif_tx_lock protection is necessary
2776            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2777            counters.)
2778            However, it is possible, that they rely on protection
2779            made by us here.
2780
2781            Check this and shot the lock. It is not prone from deadlocks.
2782            Either shot noqueue qdisc, it is even simpler 8)
2783          */
2784         if (dev->flags & IFF_UP) {
2785                 int cpu = smp_processor_id(); /* ok because BHs are off */
2786
2787                 if (txq->xmit_lock_owner != cpu) {
2788
2789                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2790                                 goto recursion_alert;
2791
2792                         HARD_TX_LOCK(dev, txq, cpu);
2793
2794                         if (!netif_xmit_stopped(txq)) {
2795                                 __this_cpu_inc(xmit_recursion);
2796                                 rc = dev_hard_start_xmit(skb, dev, txq);
2797                                 __this_cpu_dec(xmit_recursion);
2798                                 if (dev_xmit_complete(rc)) {
2799                                         HARD_TX_UNLOCK(dev, txq);
2800                                         goto out;
2801                                 }
2802                         }
2803                         HARD_TX_UNLOCK(dev, txq);
2804                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2805                                              dev->name);
2806                 } else {
2807                         /* Recursion is detected! It is possible,
2808                          * unfortunately
2809                          */
2810 recursion_alert:
2811                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2812                                              dev->name);
2813                 }
2814         }
2815
2816         rc = -ENETDOWN;
2817         rcu_read_unlock_bh();
2818
2819         kfree_skb(skb);
2820         return rc;
2821 out:
2822         rcu_read_unlock_bh();
2823         return rc;
2824 }
2825 EXPORT_SYMBOL(dev_queue_xmit);
2826
2827
2828 /*=======================================================================
2829                         Receiver routines
2830   =======================================================================*/
2831
2832 int netdev_max_backlog __read_mostly = 1000;
2833 EXPORT_SYMBOL(netdev_max_backlog);
2834
2835 int netdev_tstamp_prequeue __read_mostly = 1;
2836 int netdev_budget __read_mostly = 300;
2837 int weight_p __read_mostly = 64;            /* old backlog weight */
2838
2839 /* Called with irq disabled */
2840 static inline void ____napi_schedule(struct softnet_data *sd,
2841                                      struct napi_struct *napi)
2842 {
2843         list_add_tail(&napi->poll_list, &sd->poll_list);
2844         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2845 }
2846
2847 #ifdef CONFIG_RPS
2848
2849 /* One global table that all flow-based protocols share. */
2850 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2851 EXPORT_SYMBOL(rps_sock_flow_table);
2852
2853 struct static_key rps_needed __read_mostly;
2854
2855 static struct rps_dev_flow *
2856 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2857             struct rps_dev_flow *rflow, u16 next_cpu)
2858 {
2859         if (next_cpu != RPS_NO_CPU) {
2860 #ifdef CONFIG_RFS_ACCEL
2861                 struct netdev_rx_queue *rxqueue;
2862                 struct rps_dev_flow_table *flow_table;
2863                 struct rps_dev_flow *old_rflow;
2864                 u32 flow_id;
2865                 u16 rxq_index;
2866                 int rc;
2867
2868                 /* Should we steer this flow to a different hardware queue? */
2869                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2870                     !(dev->features & NETIF_F_NTUPLE))
2871                         goto out;
2872                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2873                 if (rxq_index == skb_get_rx_queue(skb))
2874                         goto out;
2875
2876                 rxqueue = dev->_rx + rxq_index;
2877                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2878                 if (!flow_table)
2879                         goto out;
2880                 flow_id = skb->rxhash & flow_table->mask;
2881                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2882                                                         rxq_index, flow_id);
2883                 if (rc < 0)
2884                         goto out;
2885                 old_rflow = rflow;
2886                 rflow = &flow_table->flows[flow_id];
2887                 rflow->filter = rc;
2888                 if (old_rflow->filter == rflow->filter)
2889                         old_rflow->filter = RPS_NO_FILTER;
2890         out:
2891 #endif
2892                 rflow->last_qtail =
2893                         per_cpu(softnet_data, next_cpu).input_queue_head;
2894         }
2895
2896         rflow->cpu = next_cpu;
2897         return rflow;
2898 }
2899
2900 /*
2901  * get_rps_cpu is called from netif_receive_skb and returns the target
2902  * CPU from the RPS map of the receiving queue for a given skb.
2903  * rcu_read_lock must be held on entry.
2904  */
2905 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2906                        struct rps_dev_flow **rflowp)
2907 {
2908         struct netdev_rx_queue *rxqueue;
2909         struct rps_map *map;
2910         struct rps_dev_flow_table *flow_table;
2911         struct rps_sock_flow_table *sock_flow_table;
2912         int cpu = -1;
2913         u16 tcpu;
2914
2915         if (skb_rx_queue_recorded(skb)) {
2916                 u16 index = skb_get_rx_queue(skb);
2917                 if (unlikely(index >= dev->real_num_rx_queues)) {
2918                         WARN_ONCE(dev->real_num_rx_queues > 1,
2919                                   "%s received packet on queue %u, but number "
2920                                   "of RX queues is %u\n",
2921                                   dev->name, index, dev->real_num_rx_queues);
2922                         goto done;
2923                 }
2924                 rxqueue = dev->_rx + index;
2925         } else
2926                 rxqueue = dev->_rx;
2927
2928         map = rcu_dereference(rxqueue->rps_map);
2929         if (map) {
2930                 if (map->len == 1 &&
2931                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2932                         tcpu = map->cpus[0];
2933                         if (cpu_online(tcpu))
2934                                 cpu = tcpu;
2935                         goto done;
2936                 }
2937         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2938                 goto done;
2939         }
2940
2941         skb_reset_network_header(skb);
2942         if (!skb_get_rxhash(skb))
2943                 goto done;
2944
2945         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2946         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2947         if (flow_table && sock_flow_table) {
2948                 u16 next_cpu;
2949                 struct rps_dev_flow *rflow;
2950
2951                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2952                 tcpu = rflow->cpu;
2953
2954                 next_cpu = sock_flow_table->ents[skb->rxhash &
2955                     sock_flow_table->mask];
2956
2957                 /*
2958                  * If the desired CPU (where last recvmsg was done) is
2959                  * different from current CPU (one in the rx-queue flow
2960                  * table entry), switch if one of the following holds:
2961                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2962                  *   - Current CPU is offline.
2963                  *   - The current CPU's queue tail has advanced beyond the
2964                  *     last packet that was enqueued using this table entry.
2965                  *     This guarantees that all previous packets for the flow
2966                  *     have been dequeued, thus preserving in order delivery.
2967                  */
2968                 if (unlikely(tcpu != next_cpu) &&
2969                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2970                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2971                       rflow->last_qtail)) >= 0)) {
2972                         tcpu = next_cpu;
2973                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2974                 }
2975
2976                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2977                         *rflowp = rflow;
2978                         cpu = tcpu;
2979                         goto done;
2980                 }
2981         }
2982
2983         if (map) {
2984                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2985
2986                 if (cpu_online(tcpu)) {
2987                         cpu = tcpu;
2988                         goto done;
2989                 }
2990         }
2991
2992 done:
2993         return cpu;
2994 }
2995
2996 #ifdef CONFIG_RFS_ACCEL
2997
2998 /**
2999  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3000  * @dev: Device on which the filter was set
3001  * @rxq_index: RX queue index
3002  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3003  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3004  *
3005  * Drivers that implement ndo_rx_flow_steer() should periodically call
3006  * this function for each installed filter and remove the filters for
3007  * which it returns %true.
3008  */
3009 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3010                          u32 flow_id, u16 filter_id)
3011 {
3012         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3013         struct rps_dev_flow_table *flow_table;
3014         struct rps_dev_flow *rflow;
3015         bool expire = true;
3016         int cpu;
3017
3018         rcu_read_lock();
3019         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3020         if (flow_table && flow_id <= flow_table->mask) {
3021                 rflow = &flow_table->flows[flow_id];
3022                 cpu = ACCESS_ONCE(rflow->cpu);
3023                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3024                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3025                            rflow->last_qtail) <
3026                      (int)(10 * flow_table->mask)))
3027                         expire = false;
3028         }
3029         rcu_read_unlock();
3030         return expire;
3031 }
3032 EXPORT_SYMBOL(rps_may_expire_flow);
3033
3034 #endif /* CONFIG_RFS_ACCEL */
3035
3036 /* Called from hardirq (IPI) context */
3037 static void rps_trigger_softirq(void *data)
3038 {
3039         struct softnet_data *sd = data;
3040
3041         ____napi_schedule(sd, &sd->backlog);
3042         sd->received_rps++;
3043 }
3044
3045 #endif /* CONFIG_RPS */
3046
3047 /*
3048  * Check if this softnet_data structure is another cpu one
3049  * If yes, queue it to our IPI list and return 1
3050  * If no, return 0
3051  */
3052 static int rps_ipi_queued(struct softnet_data *sd)
3053 {
3054 #ifdef CONFIG_RPS
3055         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3056
3057         if (sd != mysd) {
3058                 sd->rps_ipi_next = mysd->rps_ipi_list;
3059                 mysd->rps_ipi_list = sd;
3060
3061                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3062                 return 1;
3063         }
3064 #endif /* CONFIG_RPS */
3065         return 0;
3066 }
3067
3068 /*
3069  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3070  * queue (may be a remote CPU queue).
3071  */
3072 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3073                               unsigned int *qtail)
3074 {
3075         struct softnet_data *sd;
3076         unsigned long flags;
3077
3078         sd = &per_cpu(softnet_data, cpu);
3079
3080         local_irq_save(flags);
3081
3082         rps_lock(sd);
3083         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3084                 if (skb_queue_len(&sd->input_pkt_queue)) {
3085 enqueue:
3086                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3087                         input_queue_tail_incr_save(sd, qtail);
3088                         rps_unlock(sd);
3089                         local_irq_restore(flags);
3090                         return NET_RX_SUCCESS;
3091                 }
3092
3093                 /* Schedule NAPI for backlog device
3094                  * We can use non atomic operation since we own the queue lock
3095                  */
3096                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3097                         if (!rps_ipi_queued(sd))
3098                                 ____napi_schedule(sd, &sd->backlog);
3099                 }
3100                 goto enqueue;
3101         }
3102
3103         sd->dropped++;
3104         rps_unlock(sd);
3105
3106         local_irq_restore(flags);
3107
3108         atomic_long_inc(&skb->dev->rx_dropped);
3109         kfree_skb(skb);
3110         return NET_RX_DROP;
3111 }
3112
3113 /**
3114  *      netif_rx        -       post buffer to the network code
3115  *      @skb: buffer to post
3116  *
3117  *      This function receives a packet from a device driver and queues it for
3118  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3119  *      may be dropped during processing for congestion control or by the
3120  *      protocol layers.
3121  *
3122  *      return values:
3123  *      NET_RX_SUCCESS  (no congestion)
3124  *      NET_RX_DROP     (packet was dropped)
3125  *
3126  */
3127
3128 int netif_rx(struct sk_buff *skb)
3129 {
3130         int ret;
3131
3132         /* if netpoll wants it, pretend we never saw it */
3133         if (netpoll_rx(skb))
3134                 return NET_RX_DROP;
3135
3136         net_timestamp_check(netdev_tstamp_prequeue, skb);
3137
3138         trace_netif_rx(skb);
3139 #ifdef CONFIG_RPS
3140         if (static_key_false(&rps_needed)) {
3141                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3142                 int cpu;
3143
3144                 preempt_disable();
3145                 rcu_read_lock();
3146
3147                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3148                 if (cpu < 0)
3149                         cpu = smp_processor_id();
3150
3151                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3152
3153                 rcu_read_unlock();
3154                 preempt_enable();
3155         } else
3156 #endif
3157         {
3158                 unsigned int qtail;
3159                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3160                 put_cpu();
3161         }
3162         return ret;
3163 }
3164 EXPORT_SYMBOL(netif_rx);
3165
3166 int netif_rx_ni(struct sk_buff *skb)
3167 {
3168         int err;
3169
3170         preempt_disable();
3171         err = netif_rx(skb);
3172         if (local_softirq_pending())
3173                 do_softirq();
3174         preempt_enable();
3175
3176         return err;
3177 }
3178 EXPORT_SYMBOL(netif_rx_ni);
3179
3180 static void net_tx_action(struct softirq_action *h)
3181 {
3182         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3183
3184         if (sd->completion_queue) {
3185                 struct sk_buff *clist;
3186
3187                 local_irq_disable();
3188                 clist = sd->completion_queue;
3189                 sd->completion_queue = NULL;
3190                 local_irq_enable();
3191
3192                 while (clist) {
3193                         struct sk_buff *skb = clist;
3194                         clist = clist->next;
3195
3196                         WARN_ON(atomic_read(&skb->users));
3197                         trace_kfree_skb(skb, net_tx_action);
3198                         __kfree_skb(skb);
3199                 }
3200         }
3201
3202         if (sd->output_queue) {
3203                 struct Qdisc *head;
3204
3205                 local_irq_disable();
3206                 head = sd->output_queue;
3207                 sd->output_queue = NULL;
3208                 sd->output_queue_tailp = &sd->output_queue;
3209                 local_irq_enable();
3210
3211                 while (head) {
3212                         struct Qdisc *q = head;
3213                         spinlock_t *root_lock;
3214
3215                         head = head->next_sched;
3216
3217                         root_lock = qdisc_lock(q);
3218                         if (spin_trylock(root_lock)) {
3219                                 smp_mb__before_clear_bit();
3220                                 clear_bit(__QDISC_STATE_SCHED,
3221                                           &q->state);
3222                                 qdisc_run(q);
3223                                 spin_unlock(root_lock);
3224                         } else {
3225                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3226                                               &q->state)) {
3227                                         __netif_reschedule(q);
3228                                 } else {
3229                                         smp_mb__before_clear_bit();
3230                                         clear_bit(__QDISC_STATE_SCHED,
3231                                                   &q->state);
3232                                 }
3233                         }
3234                 }
3235         }
3236 }
3237
3238 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3239     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3240 /* This hook is defined here for ATM LANE */
3241 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3242                              unsigned char *addr) __read_mostly;
3243 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3244 #endif
3245
3246 #ifdef CONFIG_NET_CLS_ACT
3247 /* TODO: Maybe we should just force sch_ingress to be compiled in
3248  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3249  * a compare and 2 stores extra right now if we dont have it on
3250  * but have CONFIG_NET_CLS_ACT
3251  * NOTE: This doesn't stop any functionality; if you dont have
3252  * the ingress scheduler, you just can't add policies on ingress.
3253  *
3254  */
3255 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3256 {
3257         struct net_device *dev = skb->dev;
3258         u32 ttl = G_TC_RTTL(skb->tc_verd);
3259         int result = TC_ACT_OK;
3260         struct Qdisc *q;
3261
3262         if (unlikely(MAX_RED_LOOP < ttl++)) {
3263                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3264                                      skb->skb_iif, dev->ifindex);
3265                 return TC_ACT_SHOT;
3266         }
3267
3268         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3269         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3270
3271         q = rxq->qdisc;
3272         if (q != &noop_qdisc) {
3273                 spin_lock(qdisc_lock(q));
3274                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3275                         result = qdisc_enqueue_root(skb, q);
3276                 spin_unlock(qdisc_lock(q));
3277         }
3278
3279         return result;
3280 }
3281
3282 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3283                                          struct packet_type **pt_prev,
3284                                          int *ret, struct net_device *orig_dev)
3285 {
3286         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3287
3288         if (!rxq || rxq->qdisc == &noop_qdisc)
3289                 goto out;
3290
3291         if (*pt_prev) {
3292                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3293                 *pt_prev = NULL;
3294         }
3295
3296         switch (ing_filter(skb, rxq)) {
3297         case TC_ACT_SHOT:
3298         case TC_ACT_STOLEN:
3299                 kfree_skb(skb);
3300                 return NULL;
3301         }
3302
3303 out:
3304         skb->tc_verd = 0;
3305         return skb;
3306 }
3307 #endif
3308
3309 /**
3310  *      netdev_rx_handler_register - register receive handler
3311  *      @dev: device to register a handler for
3312  *      @rx_handler: receive handler to register
3313  *      @rx_handler_data: data pointer that is used by rx handler
3314  *
3315  *      Register a receive hander for a device. This handler will then be
3316  *      called from __netif_receive_skb. A negative errno code is returned
3317  *      on a failure.
3318  *
3319  *      The caller must hold the rtnl_mutex.
3320  *
3321  *      For a general description of rx_handler, see enum rx_handler_result.
3322  */
3323 int netdev_rx_handler_register(struct net_device *dev,
3324                                rx_handler_func_t *rx_handler,
3325                                void *rx_handler_data)
3326 {
3327         ASSERT_RTNL();
3328
3329         if (dev->rx_handler)
3330                 return -EBUSY;
3331
3332         /* Note: rx_handler_data must be set before rx_handler */
3333         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3334         rcu_assign_pointer(dev->rx_handler, rx_handler);
3335
3336         return 0;
3337 }
3338 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3339
3340 /**
3341  *      netdev_rx_handler_unregister - unregister receive handler
3342  *      @dev: device to unregister a handler from
3343  *
3344  *      Unregister a receive handler from a device.
3345  *
3346  *      The caller must hold the rtnl_mutex.
3347  */
3348 void netdev_rx_handler_unregister(struct net_device *dev)
3349 {
3350
3351         ASSERT_RTNL();
3352         RCU_INIT_POINTER(dev->rx_handler, NULL);
3353         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3354          * section has a guarantee to see a non NULL rx_handler_data
3355          * as well.
3356          */
3357         synchronize_net();
3358         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3359 }
3360 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3361
3362 /*
3363  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3364  * the special handling of PFMEMALLOC skbs.
3365  */
3366 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3367 {
3368         switch (skb->protocol) {
3369         case __constant_htons(ETH_P_ARP):
3370         case __constant_htons(ETH_P_IP):
3371         case __constant_htons(ETH_P_IPV6):
3372         case __constant_htons(ETH_P_8021Q):
3373         case __constant_htons(ETH_P_8021AD):
3374                 return true;
3375         default:
3376                 return false;
3377         }
3378 }
3379
3380 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3381 {
3382         struct packet_type *ptype, *pt_prev;
3383         rx_handler_func_t *rx_handler;
3384         struct net_device *orig_dev;
3385         struct net_device *null_or_dev;
3386         bool deliver_exact = false;
3387         int ret = NET_RX_DROP;
3388         __be16 type;
3389
3390         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3391
3392         trace_netif_receive_skb(skb);
3393
3394         /* if we've gotten here through NAPI, check netpoll */
3395         if (netpoll_receive_skb(skb))
3396                 goto out;
3397
3398         orig_dev = skb->dev;
3399
3400         skb_reset_network_header(skb);
3401         if (!skb_transport_header_was_set(skb))
3402                 skb_reset_transport_header(skb);
3403         skb_reset_mac_len(skb);
3404
3405         pt_prev = NULL;
3406
3407         rcu_read_lock();
3408
3409 another_round:
3410         skb->skb_iif = skb->dev->ifindex;
3411
3412         __this_cpu_inc(softnet_data.processed);
3413
3414         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3415             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3416                 skb = vlan_untag(skb);
3417                 if (unlikely(!skb))
3418                         goto unlock;
3419         }
3420
3421 #ifdef CONFIG_NET_CLS_ACT
3422         if (skb->tc_verd & TC_NCLS) {
3423                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3424                 goto ncls;
3425         }
3426 #endif
3427
3428         if (pfmemalloc)
3429                 goto skip_taps;
3430
3431         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3432                 if (!ptype->dev || ptype->dev == skb->dev) {
3433                         if (pt_prev)
3434                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3435                         pt_prev = ptype;
3436                 }
3437         }
3438
3439 skip_taps:
3440 #ifdef CONFIG_NET_CLS_ACT
3441         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3442         if (!skb)
3443                 goto unlock;
3444 ncls:
3445 #endif
3446
3447         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3448                 goto drop;
3449
3450         if (vlan_tx_tag_present(skb)) {
3451                 if (pt_prev) {
3452                         ret = deliver_skb(skb, pt_prev, orig_dev);
3453                         pt_prev = NULL;
3454                 }
3455                 if (vlan_do_receive(&skb))
3456                         goto another_round;
3457                 else if (unlikely(!skb))
3458                         goto unlock;
3459         }
3460
3461         rx_handler = rcu_dereference(skb->dev->rx_handler);
3462         if (rx_handler) {
3463                 if (pt_prev) {
3464                         ret = deliver_skb(skb, pt_prev, orig_dev);
3465                         pt_prev = NULL;
3466                 }
3467                 switch (rx_handler(&skb)) {
3468                 case RX_HANDLER_CONSUMED:
3469                         ret = NET_RX_SUCCESS;
3470                         goto unlock;
3471                 case RX_HANDLER_ANOTHER:
3472                         goto another_round;
3473                 case RX_HANDLER_EXACT:
3474                         deliver_exact = true;
3475                 case RX_HANDLER_PASS:
3476                         break;
3477                 default:
3478                         BUG();
3479                 }
3480         }
3481
3482         if (vlan_tx_nonzero_tag_present(skb))
3483                 skb->pkt_type = PACKET_OTHERHOST;
3484
3485         /* deliver only exact match when indicated */
3486         null_or_dev = deliver_exact ? skb->dev : NULL;
3487
3488         type = skb->protocol;
3489         list_for_each_entry_rcu(ptype,
3490                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3491                 if (ptype->type == type &&
3492                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3493                      ptype->dev == orig_dev)) {
3494                         if (pt_prev)
3495                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3496                         pt_prev = ptype;
3497                 }
3498         }
3499
3500         if (pt_prev) {
3501                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3502                         goto drop;
3503                 else
3504                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3505         } else {
3506 drop:
3507                 atomic_long_inc(&skb->dev->rx_dropped);
3508                 kfree_skb(skb);
3509                 /* Jamal, now you will not able to escape explaining
3510                  * me how you were going to use this. :-)
3511                  */
3512                 ret = NET_RX_DROP;
3513         }
3514
3515 unlock:
3516         rcu_read_unlock();
3517 out:
3518         return ret;
3519 }
3520
3521 static int __netif_receive_skb(struct sk_buff *skb)
3522 {
3523         int ret;
3524
3525         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3526                 unsigned long pflags = current->flags;
3527
3528                 /*
3529                  * PFMEMALLOC skbs are special, they should
3530                  * - be delivered to SOCK_MEMALLOC sockets only
3531                  * - stay away from userspace
3532                  * - have bounded memory usage
3533                  *
3534                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3535                  * context down to all allocation sites.
3536                  */
3537                 current->flags |= PF_MEMALLOC;
3538                 ret = __netif_receive_skb_core(skb, true);
3539                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3540         } else
3541                 ret = __netif_receive_skb_core(skb, false);
3542
3543         return ret;
3544 }
3545
3546 /**
3547  *      netif_receive_skb - process receive buffer from network
3548  *      @skb: buffer to process
3549  *
3550  *      netif_receive_skb() is the main receive data processing function.
3551  *      It always succeeds. The buffer may be dropped during processing
3552  *      for congestion control or by the protocol layers.
3553  *
3554  *      This function may only be called from softirq context and interrupts
3555  *      should be enabled.
3556  *
3557  *      Return values (usually ignored):
3558  *      NET_RX_SUCCESS: no congestion
3559  *      NET_RX_DROP: packet was dropped
3560  */
3561 int netif_receive_skb(struct sk_buff *skb)
3562 {
3563         net_timestamp_check(netdev_tstamp_prequeue, skb);
3564
3565         if (skb_defer_rx_timestamp(skb))
3566                 return NET_RX_SUCCESS;
3567
3568 #ifdef CONFIG_RPS
3569         if (static_key_false(&rps_needed)) {
3570                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3571                 int cpu, ret;
3572
3573                 rcu_read_lock();
3574
3575                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3576
3577                 if (cpu >= 0) {
3578                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3579                         rcu_read_unlock();
3580                         return ret;
3581                 }
3582                 rcu_read_unlock();
3583         }
3584 #endif
3585         return __netif_receive_skb(skb);
3586 }
3587 EXPORT_SYMBOL(netif_receive_skb);
3588
3589 /* Network device is going away, flush any packets still pending
3590  * Called with irqs disabled.
3591  */
3592 static void flush_backlog(void *arg)
3593 {
3594         struct net_device *dev = arg;
3595         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3596         struct sk_buff *skb, *tmp;
3597
3598         rps_lock(sd);
3599         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3600                 if (skb->dev == dev) {
3601                         __skb_unlink(skb, &sd->input_pkt_queue);
3602                         kfree_skb(skb);
3603                         input_queue_head_incr(sd);
3604                 }
3605         }
3606         rps_unlock(sd);
3607
3608         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3609                 if (skb->dev == dev) {
3610                         __skb_unlink(skb, &sd->process_queue);
3611                         kfree_skb(skb);
3612                         input_queue_head_incr(sd);
3613                 }
3614         }
3615 }
3616
3617 static int napi_gro_complete(struct sk_buff *skb)
3618 {
3619         struct packet_offload *ptype;
3620         __be16 type = skb->protocol;
3621         struct list_head *head = &offload_base;
3622         int err = -ENOENT;
3623
3624         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3625
3626         if (NAPI_GRO_CB(skb)->count == 1) {
3627                 skb_shinfo(skb)->gso_size = 0;
3628                 goto out;
3629         }
3630
3631         rcu_read_lock();
3632         list_for_each_entry_rcu(ptype, head, list) {
3633                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3634                         continue;
3635
3636                 err = ptype->callbacks.gro_complete(skb);
3637                 break;
3638         }
3639         rcu_read_unlock();
3640
3641         if (err) {
3642                 WARN_ON(&ptype->list == head);
3643                 kfree_skb(skb);
3644                 return NET_RX_SUCCESS;
3645         }
3646
3647 out:
3648         return netif_receive_skb(skb);
3649 }
3650
3651 /* napi->gro_list contains packets ordered by age.
3652  * youngest packets at the head of it.
3653  * Complete skbs in reverse order to reduce latencies.
3654  */
3655 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3656 {
3657         struct sk_buff *skb, *prev = NULL;
3658
3659         /* scan list and build reverse chain */
3660         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3661                 skb->prev = prev;
3662                 prev = skb;
3663         }
3664
3665         for (skb = prev; skb; skb = prev) {
3666                 skb->next = NULL;
3667
3668                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3669                         return;
3670
3671                 prev = skb->prev;
3672                 napi_gro_complete(skb);
3673                 napi->gro_count--;
3674         }
3675
3676         napi->gro_list = NULL;
3677 }
3678 EXPORT_SYMBOL(napi_gro_flush);
3679
3680 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3681 {
3682         struct sk_buff *p;
3683         unsigned int maclen = skb->dev->hard_header_len;
3684
3685         for (p = napi->gro_list; p; p = p->next) {
3686                 unsigned long diffs;
3687
3688                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3689                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3690                 if (maclen == ETH_HLEN)
3691                         diffs |= compare_ether_header(skb_mac_header(p),
3692                                                       skb_gro_mac_header(skb));
3693                 else if (!diffs)
3694                         diffs = memcmp(skb_mac_header(p),
3695                                        skb_gro_mac_header(skb),
3696                                        maclen);
3697                 NAPI_GRO_CB(p)->same_flow = !diffs;
3698                 NAPI_GRO_CB(p)->flush = 0;
3699         }
3700 }
3701
3702 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3703 {
3704         struct sk_buff **pp = NULL;
3705         struct packet_offload *ptype;
3706         __be16 type = skb->protocol;
3707         struct list_head *head = &offload_base;
3708         int same_flow;
3709         enum gro_result ret;
3710
3711         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3712                 goto normal;
3713
3714         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3715                 goto normal;
3716
3717         gro_list_prepare(napi, skb);
3718
3719         rcu_read_lock();
3720         list_for_each_entry_rcu(ptype, head, list) {
3721                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3722                         continue;
3723
3724                 skb_set_network_header(skb, skb_gro_offset(skb));
3725                 skb_reset_mac_len(skb);
3726                 NAPI_GRO_CB(skb)->same_flow = 0;
3727                 NAPI_GRO_CB(skb)->flush = 0;
3728                 NAPI_GRO_CB(skb)->free = 0;
3729
3730                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3731                 break;
3732         }
3733         rcu_read_unlock();
3734
3735         if (&ptype->list == head)
3736                 goto normal;
3737
3738         same_flow = NAPI_GRO_CB(skb)->same_flow;
3739         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3740
3741         if (pp) {
3742                 struct sk_buff *nskb = *pp;
3743
3744                 *pp = nskb->next;
3745                 nskb->next = NULL;
3746                 napi_gro_complete(nskb);
3747                 napi->gro_count--;
3748         }
3749
3750         if (same_flow)
3751                 goto ok;
3752
3753         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3754                 goto normal;
3755
3756         napi->gro_count++;
3757         NAPI_GRO_CB(skb)->count = 1;
3758         NAPI_GRO_CB(skb)->age = jiffies;
3759         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3760         skb->next = napi->gro_list;
3761         napi->gro_list = skb;
3762         ret = GRO_HELD;
3763
3764 pull:
3765         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3766                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3767
3768                 BUG_ON(skb->end - skb->tail < grow);
3769
3770                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3771
3772                 skb->tail += grow;
3773                 skb->data_len -= grow;
3774
3775                 skb_shinfo(skb)->frags[0].page_offset += grow;
3776                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3777
3778                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3779                         skb_frag_unref(skb, 0);
3780                         memmove(skb_shinfo(skb)->frags,
3781                                 skb_shinfo(skb)->frags + 1,
3782                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3783                 }
3784         }
3785
3786 ok:
3787         return ret;
3788
3789 normal:
3790         ret = GRO_NORMAL;
3791         goto pull;
3792 }
3793
3794
3795 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3796 {
3797         switch (ret) {
3798         case GRO_NORMAL:
3799                 if (netif_receive_skb(skb))
3800                         ret = GRO_DROP;
3801                 break;
3802
3803         case GRO_DROP:
3804                 kfree_skb(skb);
3805                 break;
3806
3807         case GRO_MERGED_FREE:
3808                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3809                         kmem_cache_free(skbuff_head_cache, skb);
3810                 else
3811                         __kfree_skb(skb);
3812                 break;
3813
3814         case GRO_HELD:
3815         case GRO_MERGED:
3816                 break;
3817         }
3818
3819         return ret;
3820 }
3821
3822 static void skb_gro_reset_offset(struct sk_buff *skb)
3823 {
3824         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3825         const skb_frag_t *frag0 = &pinfo->frags[0];
3826
3827         NAPI_GRO_CB(skb)->data_offset = 0;
3828         NAPI_GRO_CB(skb)->frag0 = NULL;
3829         NAPI_GRO_CB(skb)->frag0_len = 0;
3830
3831         if (skb->mac_header == skb->tail &&
3832             pinfo->nr_frags &&
3833             !PageHighMem(skb_frag_page(frag0))) {
3834                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3835                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3836         }
3837 }
3838
3839 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3840 {
3841         skb_gro_reset_offset(skb);
3842
3843         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3844 }
3845 EXPORT_SYMBOL(napi_gro_receive);
3846
3847 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3848 {
3849         __skb_pull(skb, skb_headlen(skb));
3850         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3851         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3852         skb->vlan_tci = 0;
3853         skb->dev = napi->dev;
3854         skb->skb_iif = 0;
3855
3856         napi->skb = skb;
3857 }
3858
3859 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3860 {
3861         struct sk_buff *skb = napi->skb;
3862
3863         if (!skb) {
3864                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3865                 if (skb)
3866                         napi->skb = skb;
3867         }
3868         return skb;
3869 }
3870 EXPORT_SYMBOL(napi_get_frags);
3871
3872 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3873                                gro_result_t ret)
3874 {
3875         switch (ret) {
3876         case GRO_NORMAL:
3877         case GRO_HELD:
3878                 skb->protocol = eth_type_trans(skb, skb->dev);
3879
3880                 if (ret == GRO_HELD)
3881                         skb_gro_pull(skb, -ETH_HLEN);
3882                 else if (netif_receive_skb(skb))
3883                         ret = GRO_DROP;
3884                 break;
3885
3886         case GRO_DROP:
3887         case GRO_MERGED_FREE:
3888                 napi_reuse_skb(napi, skb);
3889                 break;
3890
3891         case GRO_MERGED:
3892                 break;
3893         }
3894
3895         return ret;
3896 }
3897
3898 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3899 {
3900         struct sk_buff *skb = napi->skb;
3901         struct ethhdr *eth;
3902         unsigned int hlen;
3903         unsigned int off;
3904
3905         napi->skb = NULL;
3906
3907         skb_reset_mac_header(skb);
3908         skb_gro_reset_offset(skb);
3909
3910         off = skb_gro_offset(skb);
3911         hlen = off + sizeof(*eth);
3912         eth = skb_gro_header_fast(skb, off);
3913         if (skb_gro_header_hard(skb, hlen)) {
3914                 eth = skb_gro_header_slow(skb, hlen, off);
3915                 if (unlikely(!eth)) {
3916                         napi_reuse_skb(napi, skb);
3917                         skb = NULL;
3918                         goto out;
3919                 }
3920         }
3921
3922         skb_gro_pull(skb, sizeof(*eth));
3923
3924         /*
3925          * This works because the only protocols we care about don't require
3926          * special handling.  We'll fix it up properly at the end.
3927          */
3928         skb->protocol = eth->h_proto;
3929
3930 out:
3931         return skb;
3932 }
3933
3934 gro_result_t napi_gro_frags(struct napi_struct *napi)
3935 {
3936         struct sk_buff *skb = napi_frags_skb(napi);
3937
3938         if (!skb)
3939                 return GRO_DROP;
3940
3941         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
3942 }
3943 EXPORT_SYMBOL(napi_gro_frags);
3944
3945 /*
3946  * net_rps_action sends any pending IPI's for rps.
3947  * Note: called with local irq disabled, but exits with local irq enabled.
3948  */
3949 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3950 {
3951 #ifdef CONFIG_RPS
3952         struct softnet_data *remsd = sd->rps_ipi_list;
3953
3954         if (remsd) {
3955                 sd->rps_ipi_list = NULL;
3956
3957                 local_irq_enable();
3958
3959                 /* Send pending IPI's to kick RPS processing on remote cpus. */
3960                 while (remsd) {
3961                         struct softnet_data *next = remsd->rps_ipi_next;
3962
3963                         if (cpu_online(remsd->cpu))
3964                                 __smp_call_function_single(remsd->cpu,
3965                                                            &remsd->csd, 0);
3966                         remsd = next;
3967                 }
3968         } else
3969 #endif
3970                 local_irq_enable();
3971 }
3972
3973 static int process_backlog(struct napi_struct *napi, int quota)
3974 {
3975         int work = 0;
3976         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3977
3978 #ifdef CONFIG_RPS
3979         /* Check if we have pending ipi, its better to send them now,
3980          * not waiting net_rx_action() end.
3981          */
3982         if (sd->rps_ipi_list) {
3983                 local_irq_disable();
3984                 net_rps_action_and_irq_enable(sd);
3985         }
3986 #endif
3987         napi->weight = weight_p;
3988         local_irq_disable();
3989         while (work < quota) {
3990                 struct sk_buff *skb;
3991                 unsigned int qlen;
3992
3993                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3994                         local_irq_enable();
3995                         __netif_receive_skb(skb);
3996                         local_irq_disable();
3997                         input_queue_head_incr(sd);
3998                         if (++work >= quota) {
3999                                 local_irq_enable();
4000                                 return work;
4001                         }
4002                 }
4003
4004                 rps_lock(sd);
4005                 qlen = skb_queue_len(&sd->input_pkt_queue);
4006                 if (qlen)
4007                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4008                                                    &sd->process_queue);
4009
4010                 if (qlen < quota - work) {
4011                         /*
4012                          * Inline a custom version of __napi_complete().
4013                          * only current cpu owns and manipulates this napi,
4014                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4015                          * we can use a plain write instead of clear_bit(),
4016                          * and we dont need an smp_mb() memory barrier.
4017                          */
4018                         list_del(&napi->poll_list);
4019                         napi->state = 0;
4020
4021                         quota = work + qlen;
4022                 }
4023                 rps_unlock(sd);
4024         }
4025         local_irq_enable();
4026
4027         return work;
4028 }
4029
4030 /**
4031  * __napi_schedule - schedule for receive
4032  * @n: entry to schedule
4033  *
4034  * The entry's receive function will be scheduled to run
4035  */
4036 void __napi_schedule(struct napi_struct *n)
4037 {
4038         unsigned long flags;
4039
4040         local_irq_save(flags);
4041         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4042         local_irq_restore(flags);
4043 }
4044 EXPORT_SYMBOL(__napi_schedule);
4045
4046 void __napi_complete(struct napi_struct *n)
4047 {
4048         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4049         BUG_ON(n->gro_list);
4050
4051         list_del(&n->poll_list);
4052         smp_mb__before_clear_bit();
4053         clear_bit(NAPI_STATE_SCHED, &n->state);
4054 }
4055 EXPORT_SYMBOL(__napi_complete);
4056
4057 void napi_complete(struct napi_struct *n)
4058 {
4059         unsigned long flags;
4060
4061         /*
4062          * don't let napi dequeue from the cpu poll list
4063          * just in case its running on a different cpu
4064          */
4065         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4066                 return;
4067
4068         napi_gro_flush(n, false);
4069         local_irq_save(flags);
4070         __napi_complete(n);
4071         local_irq_restore(flags);
4072 }
4073 EXPORT_SYMBOL(napi_complete);
4074
4075 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4076                     int (*poll)(struct napi_struct *, int), int weight)
4077 {
4078         INIT_LIST_HEAD(&napi->poll_list);
4079         napi->gro_count = 0;
4080         napi->gro_list = NULL;
4081         napi->skb = NULL;
4082         napi->poll = poll;
4083         if (weight > NAPI_POLL_WEIGHT)
4084                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4085                             weight, dev->name);
4086         napi->weight = weight;
4087         list_add(&napi->dev_list, &dev->napi_list);
4088         napi->dev = dev;
4089 #ifdef CONFIG_NETPOLL
4090         spin_lock_init(&napi->poll_lock);
4091         napi->poll_owner = -1;
4092 #endif
4093         set_bit(NAPI_STATE_SCHED, &napi->state);
4094 }
4095 EXPORT_SYMBOL(netif_napi_add);
4096
4097 void netif_napi_del(struct napi_struct *napi)
4098 {
4099         struct sk_buff *skb, *next;
4100
4101         list_del_init(&napi->dev_list);
4102         napi_free_frags(napi);
4103
4104         for (skb = napi->gro_list; skb; skb = next) {
4105                 next = skb->next;
4106                 skb->next = NULL;
4107                 kfree_skb(skb);
4108         }
4109
4110         napi->gro_list = NULL;
4111         napi->gro_count = 0;
4112 }
4113 EXPORT_SYMBOL(netif_napi_del);
4114
4115 static void net_rx_action(struct softirq_action *h)
4116 {
4117         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4118         unsigned long time_limit = jiffies + 2;
4119         int budget = netdev_budget;
4120         void *have;
4121
4122         local_irq_disable();
4123
4124         while (!list_empty(&sd->poll_list)) {
4125                 struct napi_struct *n;
4126                 int work, weight;
4127
4128                 /* If softirq window is exhuasted then punt.
4129                  * Allow this to run for 2 jiffies since which will allow
4130                  * an average latency of 1.5/HZ.
4131                  */
4132                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4133                         goto softnet_break;
4134
4135                 local_irq_enable();
4136
4137                 /* Even though interrupts have been re-enabled, this
4138                  * access is safe because interrupts can only add new
4139                  * entries to the tail of this list, and only ->poll()
4140                  * calls can remove this head entry from the list.
4141                  */
4142                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4143
4144                 have = netpoll_poll_lock(n);
4145
4146                 weight = n->weight;
4147
4148                 /* This NAPI_STATE_SCHED test is for avoiding a race
4149                  * with netpoll's poll_napi().  Only the entity which
4150                  * obtains the lock and sees NAPI_STATE_SCHED set will
4151                  * actually make the ->poll() call.  Therefore we avoid
4152                  * accidentally calling ->poll() when NAPI is not scheduled.
4153                  */
4154                 work = 0;
4155                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4156                         work = n->poll(n, weight);
4157                         trace_napi_poll(n);
4158                 }
4159
4160                 WARN_ON_ONCE(work > weight);
4161
4162                 budget -= work;
4163
4164                 local_irq_disable();
4165
4166                 /* Drivers must not modify the NAPI state if they
4167                  * consume the entire weight.  In such cases this code
4168                  * still "owns" the NAPI instance and therefore can
4169                  * move the instance around on the list at-will.
4170                  */
4171                 if (unlikely(work == weight)) {
4172                         if (unlikely(napi_disable_pending(n))) {
4173                                 local_irq_enable();
4174                                 napi_complete(n);
4175                                 local_irq_disable();
4176                         } else {
4177                                 if (n->gro_list) {
4178                                         /* flush too old packets
4179                                          * If HZ < 1000, flush all packets.
4180                                          */
4181                                         local_irq_enable();
4182                                         napi_gro_flush(n, HZ >= 1000);
4183                                         local_irq_disable();
4184                                 }
4185                                 list_move_tail(&n->poll_list, &sd->poll_list);
4186                         }
4187                 }
4188
4189                 netpoll_poll_unlock(have);
4190         }
4191 out:
4192         net_rps_action_and_irq_enable(sd);
4193
4194 #ifdef CONFIG_NET_DMA
4195         /*
4196          * There may not be any more sk_buffs coming right now, so push
4197          * any pending DMA copies to hardware
4198          */
4199         dma_issue_pending_all();
4200 #endif
4201
4202         return;
4203
4204 softnet_break:
4205         sd->time_squeeze++;
4206         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4207         goto out;
4208 }
4209
4210 struct netdev_upper {
4211         struct net_device *dev;
4212         bool master;
4213         struct list_head list;
4214         struct rcu_head rcu;
4215         struct list_head search_list;
4216 };
4217
4218 static void __append_search_uppers(struct list_head *search_list,
4219                                    struct net_device *dev)
4220 {
4221         struct netdev_upper *upper;
4222
4223         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4224                 /* check if this upper is not already in search list */
4225                 if (list_empty(&upper->search_list))
4226                         list_add_tail(&upper->search_list, search_list);
4227         }
4228 }
4229
4230 static bool __netdev_search_upper_dev(struct net_device *dev,
4231                                       struct net_device *upper_dev)
4232 {
4233         LIST_HEAD(search_list);
4234         struct netdev_upper *upper;
4235         struct netdev_upper *tmp;
4236         bool ret = false;
4237
4238         __append_search_uppers(&search_list, dev);
4239         list_for_each_entry(upper, &search_list, search_list) {
4240                 if (upper->dev == upper_dev) {
4241                         ret = true;
4242                         break;
4243                 }
4244                 __append_search_uppers(&search_list, upper->dev);
4245         }
4246         list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4247                 INIT_LIST_HEAD(&upper->search_list);
4248         return ret;
4249 }
4250
4251 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4252                                                 struct net_device *upper_dev)
4253 {
4254         struct netdev_upper *upper;
4255
4256         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4257                 if (upper->dev == upper_dev)
4258                         return upper;
4259         }
4260         return NULL;
4261 }
4262
4263 /**
4264  * netdev_has_upper_dev - Check if device is linked to an upper device
4265  * @dev: device
4266  * @upper_dev: upper device to check
4267  *
4268  * Find out if a device is linked to specified upper device and return true
4269  * in case it is. Note that this checks only immediate upper device,
4270  * not through a complete stack of devices. The caller must hold the RTNL lock.
4271  */
4272 bool netdev_has_upper_dev(struct net_device *dev,
4273                           struct net_device *upper_dev)
4274 {
4275         ASSERT_RTNL();
4276
4277         return __netdev_find_upper(dev, upper_dev);
4278 }
4279 EXPORT_SYMBOL(netdev_has_upper_dev);
4280
4281 /**
4282  * netdev_has_any_upper_dev - Check if device is linked to some device
4283  * @dev: device
4284  *
4285  * Find out if a device is linked to an upper device and return true in case
4286  * it is. The caller must hold the RTNL lock.
4287  */
4288 bool netdev_has_any_upper_dev(struct net_device *dev)
4289 {
4290         ASSERT_RTNL();
4291
4292         return !list_empty(&dev->upper_dev_list);
4293 }
4294 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4295
4296 /**
4297  * netdev_master_upper_dev_get - Get master upper device
4298  * @dev: device
4299  *
4300  * Find a master upper device and return pointer to it or NULL in case
4301  * it's not there. The caller must hold the RTNL lock.
4302  */
4303 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4304 {
4305         struct netdev_upper *upper;
4306
4307         ASSERT_RTNL();
4308
4309         if (list_empty(&dev->upper_dev_list))
4310                 return NULL;
4311
4312         upper = list_first_entry(&dev->upper_dev_list,
4313                                  struct netdev_upper, list);
4314         if (likely(upper->master))
4315                 return upper->dev;
4316         return NULL;
4317 }
4318 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4319
4320 /**
4321  * netdev_master_upper_dev_get_rcu - Get master upper device
4322  * @dev: device
4323  *
4324  * Find a master upper device and return pointer to it or NULL in case
4325  * it's not there. The caller must hold the RCU read lock.
4326  */
4327 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4328 {
4329         struct netdev_upper *upper;
4330
4331         upper = list_first_or_null_rcu(&dev->upper_dev_list,
4332                                        struct netdev_upper, list);
4333         if (upper && likely(upper->master))
4334                 return upper->dev;
4335         return NULL;
4336 }
4337 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4338
4339 static int __netdev_upper_dev_link(struct net_device *dev,
4340                                    struct net_device *upper_dev, bool master)
4341 {
4342         struct netdev_upper *upper;
4343
4344         ASSERT_RTNL();
4345
4346         if (dev == upper_dev)
4347                 return -EBUSY;
4348
4349         /* To prevent loops, check if dev is not upper device to upper_dev. */
4350         if (__netdev_search_upper_dev(upper_dev, dev))
4351                 return -EBUSY;
4352
4353         if (__netdev_find_upper(dev, upper_dev))
4354                 return -EEXIST;
4355
4356         if (master && netdev_master_upper_dev_get(dev))
4357                 return -EBUSY;
4358
4359         upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4360         if (!upper)
4361                 return -ENOMEM;
4362
4363         upper->dev = upper_dev;
4364         upper->master = master;
4365         INIT_LIST_HEAD(&upper->search_list);
4366
4367         /* Ensure that master upper link is always the first item in list. */
4368         if (master)
4369                 list_add_rcu(&upper->list, &dev->upper_dev_list);
4370         else
4371                 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4372         dev_hold(upper_dev);
4373
4374         return 0;
4375 }
4376
4377 /**
4378  * netdev_upper_dev_link - Add a link to the upper device
4379  * @dev: device
4380  * @upper_dev: new upper device
4381  *
4382  * Adds a link to device which is upper to this one. The caller must hold
4383  * the RTNL lock. On a failure a negative errno code is returned.
4384  * On success the reference counts are adjusted and the function
4385  * returns zero.
4386  */
4387 int netdev_upper_dev_link(struct net_device *dev,
4388                           struct net_device *upper_dev)
4389 {
4390         return __netdev_upper_dev_link(dev, upper_dev, false);
4391 }
4392 EXPORT_SYMBOL(netdev_upper_dev_link);
4393
4394 /**
4395  * netdev_master_upper_dev_link - Add a master link to the upper device
4396  * @dev: device
4397  * @upper_dev: new upper device
4398  *
4399  * Adds a link to device which is upper to this one. In this case, only
4400  * one master upper device can be linked, although other non-master devices
4401  * might be linked as well. The caller must hold the RTNL lock.
4402  * On a failure a negative errno code is returned. On success the reference
4403  * counts are adjusted and the function returns zero.
4404  */
4405 int netdev_master_upper_dev_link(struct net_device *dev,
4406                                  struct net_device *upper_dev)
4407 {
4408         return __netdev_upper_dev_link(dev, upper_dev, true);
4409 }
4410 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4411
4412 /**
4413  * netdev_upper_dev_unlink - Removes a link to upper device
4414  * @dev: device
4415  * @upper_dev: new upper device
4416  *
4417  * Removes a link to device which is upper to this one. The caller must hold
4418  * the RTNL lock.
4419  */
4420 void netdev_upper_dev_unlink(struct net_device *dev,
4421                              struct net_device *upper_dev)
4422 {
4423         struct netdev_upper *upper;
4424
4425         ASSERT_RTNL();
4426
4427         upper = __netdev_find_upper(dev, upper_dev);
4428         if (!upper)
4429                 return;
4430         list_del_rcu(&upper->list);
4431         dev_put(upper_dev);
4432         kfree_rcu(upper, rcu);
4433 }
4434 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4435
4436 static void dev_change_rx_flags(struct net_device *dev, int flags)
4437 {
4438         const struct net_device_ops *ops = dev->netdev_ops;
4439
4440         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4441                 ops->ndo_change_rx_flags(dev, flags);
4442 }
4443
4444 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4445 {
4446         unsigned int old_flags = dev->flags;
4447         kuid_t uid;
4448         kgid_t gid;
4449
4450         ASSERT_RTNL();
4451
4452         dev->flags |= IFF_PROMISC;
4453         dev->promiscuity += inc;
4454         if (dev->promiscuity == 0) {
4455                 /*
4456                  * Avoid overflow.
4457                  * If inc causes overflow, untouch promisc and return error.
4458                  */
4459                 if (inc < 0)
4460                         dev->flags &= ~IFF_PROMISC;
4461                 else {
4462                         dev->promiscuity -= inc;
4463                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4464                                 dev->name);
4465                         return -EOVERFLOW;
4466                 }
4467         }
4468         if (dev->flags != old_flags) {
4469                 pr_info("device %s %s promiscuous mode\n",
4470                         dev->name,
4471                         dev->flags & IFF_PROMISC ? "entered" : "left");
4472                 if (audit_enabled) {
4473                         current_uid_gid(&uid, &gid);
4474                         audit_log(current->audit_context, GFP_ATOMIC,
4475                                 AUDIT_ANOM_PROMISCUOUS,
4476                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4477                                 dev->name, (dev->flags & IFF_PROMISC),
4478                                 (old_flags & IFF_PROMISC),
4479                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4480                                 from_kuid(&init_user_ns, uid),
4481                                 from_kgid(&init_user_ns, gid),
4482                                 audit_get_sessionid(current));
4483                 }
4484
4485                 dev_change_rx_flags(dev, IFF_PROMISC);
4486         }
4487         return 0;
4488 }
4489
4490 /**
4491  *      dev_set_promiscuity     - update promiscuity count on a device
4492  *      @dev: device
4493  *      @inc: modifier
4494  *
4495  *      Add or remove promiscuity from a device. While the count in the device
4496  *      remains above zero the interface remains promiscuous. Once it hits zero
4497  *      the device reverts back to normal filtering operation. A negative inc
4498  *      value is used to drop promiscuity on the device.
4499  *      Return 0 if successful or a negative errno code on error.
4500  */
4501 int dev_set_promiscuity(struct net_device *dev, int inc)
4502 {
4503         unsigned int old_flags = dev->flags;
4504         int err;
4505
4506         err = __dev_set_promiscuity(dev, inc);
4507         if (err < 0)
4508                 return err;
4509         if (dev->flags != old_flags)
4510                 dev_set_rx_mode(dev);
4511         return err;
4512 }
4513 EXPORT_SYMBOL(dev_set_promiscuity);
4514
4515 /**
4516  *      dev_set_allmulti        - update allmulti count on a device
4517  *      @dev: device
4518  *      @inc: modifier
4519  *
4520  *      Add or remove reception of all multicast frames to a device. While the
4521  *      count in the device remains above zero the interface remains listening
4522  *      to all interfaces. Once it hits zero the device reverts back to normal
4523  *      filtering operation. A negative @inc value is used to drop the counter
4524  *      when releasing a resource needing all multicasts.
4525  *      Return 0 if successful or a negative errno code on error.
4526  */
4527
4528 int dev_set_allmulti(struct net_device *dev, int inc)
4529 {
4530         unsigned int old_flags = dev->flags;
4531
4532         ASSERT_RTNL();
4533
4534         dev->flags |= IFF_ALLMULTI;
4535         dev->allmulti += inc;
4536         if (dev->allmulti == 0) {
4537                 /*
4538                  * Avoid overflow.
4539                  * If inc causes overflow, untouch allmulti and return error.
4540                  */
4541                 if (inc < 0)
4542                         dev->flags &= ~IFF_ALLMULTI;
4543                 else {
4544                         dev->allmulti -= inc;
4545                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4546                                 dev->name);
4547                         return -EOVERFLOW;
4548                 }
4549         }
4550         if (dev->flags ^ old_flags) {
4551                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4552                 dev_set_rx_mode(dev);
4553         }
4554         return 0;
4555 }
4556 EXPORT_SYMBOL(dev_set_allmulti);
4557
4558 /*
4559  *      Upload unicast and multicast address lists to device and
4560  *      configure RX filtering. When the device doesn't support unicast
4561  *      filtering it is put in promiscuous mode while unicast addresses
4562  *      are present.
4563  */
4564 void __dev_set_rx_mode(struct net_device *dev)
4565 {
4566         const struct net_device_ops *ops = dev->netdev_ops;
4567
4568         /* dev_open will call this function so the list will stay sane. */
4569         if (!(dev->flags&IFF_UP))
4570                 return;
4571
4572         if (!netif_device_present(dev))
4573                 return;
4574
4575         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4576                 /* Unicast addresses changes may only happen under the rtnl,
4577                  * therefore calling __dev_set_promiscuity here is safe.
4578                  */
4579                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4580                         __dev_set_promiscuity(dev, 1);
4581                         dev->uc_promisc = true;
4582                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4583                         __dev_set_promiscuity(dev, -1);
4584                         dev->uc_promisc = false;
4585                 }
4586         }
4587
4588         if (ops->ndo_set_rx_mode)
4589                 ops->ndo_set_rx_mode(dev);
4590 }
4591
4592 void dev_set_rx_mode(struct net_device *dev)
4593 {
4594         netif_addr_lock_bh(dev);
4595         __dev_set_rx_mode(dev);
4596         netif_addr_unlock_bh(dev);
4597 }
4598
4599 /**
4600  *      dev_get_flags - get flags reported to userspace
4601  *      @dev: device
4602  *
4603  *      Get the combination of flag bits exported through APIs to userspace.
4604  */
4605 unsigned int dev_get_flags(const struct net_device *dev)
4606 {
4607         unsigned int flags;
4608
4609         flags = (dev->flags & ~(IFF_PROMISC |
4610                                 IFF_ALLMULTI |
4611                                 IFF_RUNNING |
4612                                 IFF_LOWER_UP |
4613                                 IFF_DORMANT)) |
4614                 (dev->gflags & (IFF_PROMISC |
4615                                 IFF_ALLMULTI));
4616
4617         if (netif_running(dev)) {
4618                 if (netif_oper_up(dev))
4619                         flags |= IFF_RUNNING;
4620                 if (netif_carrier_ok(dev))
4621                         flags |= IFF_LOWER_UP;
4622                 if (netif_dormant(dev))
4623                         flags |= IFF_DORMANT;
4624         }
4625
4626         return flags;
4627 }
4628 EXPORT_SYMBOL(dev_get_flags);
4629
4630 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4631 {
4632         unsigned int old_flags = dev->flags;
4633         int ret;
4634
4635         ASSERT_RTNL();
4636
4637         /*
4638          *      Set the flags on our device.
4639          */
4640
4641         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4642                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4643                                IFF_AUTOMEDIA)) |
4644                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4645                                     IFF_ALLMULTI));
4646
4647         /*
4648          *      Load in the correct multicast list now the flags have changed.
4649          */
4650
4651         if ((old_flags ^ flags) & IFF_MULTICAST)
4652                 dev_change_rx_flags(dev, IFF_MULTICAST);
4653
4654         dev_set_rx_mode(dev);
4655
4656         /*
4657          *      Have we downed the interface. We handle IFF_UP ourselves
4658          *      according to user attempts to set it, rather than blindly
4659          *      setting it.
4660          */
4661
4662         ret = 0;
4663         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4664                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4665
4666                 if (!ret)
4667                         dev_set_rx_mode(dev);
4668         }
4669
4670         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4671                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4672
4673                 dev->gflags ^= IFF_PROMISC;
4674                 dev_set_promiscuity(dev, inc);
4675         }
4676
4677         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4678            is important. Some (broken) drivers set IFF_PROMISC, when
4679            IFF_ALLMULTI is requested not asking us and not reporting.
4680          */
4681         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4682                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4683
4684                 dev->gflags ^= IFF_ALLMULTI;
4685                 dev_set_allmulti(dev, inc);
4686         }
4687
4688         return ret;
4689 }
4690
4691 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4692 {
4693         unsigned int changes = dev->flags ^ old_flags;
4694
4695         if (changes & IFF_UP) {
4696                 if (dev->flags & IFF_UP)
4697                         call_netdevice_notifiers(NETDEV_UP, dev);
4698                 else
4699                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4700         }
4701
4702         if (dev->flags & IFF_UP &&
4703             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4704                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4705 }
4706
4707 /**
4708  *      dev_change_flags - change device settings
4709  *      @dev: device
4710  *      @flags: device state flags
4711  *
4712  *      Change settings on device based state flags. The flags are
4713  *      in the userspace exported format.
4714  */
4715 int dev_change_flags(struct net_device *dev, unsigned int flags)
4716 {
4717         int ret;
4718         unsigned int changes, old_flags = dev->flags;
4719
4720         ret = __dev_change_flags(dev, flags);
4721         if (ret < 0)
4722                 return ret;
4723
4724         changes = old_flags ^ dev->flags;
4725         if (changes)
4726                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4727
4728         __dev_notify_flags(dev, old_flags);
4729         return ret;
4730 }
4731 EXPORT_SYMBOL(dev_change_flags);
4732
4733 /**
4734  *      dev_set_mtu - Change maximum transfer unit
4735  *      @dev: device
4736  *      @new_mtu: new transfer unit
4737  *
4738  *      Change the maximum transfer size of the network device.
4739  */
4740 int dev_set_mtu(struct net_device *dev, int new_mtu)
4741 {
4742         const struct net_device_ops *ops = dev->netdev_ops;
4743         int err;
4744
4745         if (new_mtu == dev->mtu)
4746                 return 0;
4747
4748         /*      MTU must be positive.    */
4749         if (new_mtu < 0)
4750                 return -EINVAL;
4751
4752         if (!netif_device_present(dev))
4753                 return -ENODEV;
4754
4755         err = 0;
4756         if (ops->ndo_change_mtu)
4757                 err = ops->ndo_change_mtu(dev, new_mtu);
4758         else
4759                 dev->mtu = new_mtu;
4760
4761         if (!err)
4762                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4763         return err;
4764 }
4765 EXPORT_SYMBOL(dev_set_mtu);
4766
4767 /**
4768  *      dev_set_group - Change group this device belongs to
4769  *      @dev: device
4770  *      @new_group: group this device should belong to
4771  */
4772 void dev_set_group(struct net_device *dev, int new_group)
4773 {
4774         dev->group = new_group;
4775 }
4776 EXPORT_SYMBOL(dev_set_group);
4777
4778 /**
4779  *      dev_set_mac_address - Change Media Access Control Address
4780  *      @dev: device
4781  *      @sa: new address
4782  *
4783  *      Change the hardware (MAC) address of the device
4784  */
4785 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4786 {
4787         const struct net_device_ops *ops = dev->netdev_ops;
4788         int err;
4789
4790         if (!ops->ndo_set_mac_address)
4791                 return -EOPNOTSUPP;
4792         if (sa->sa_family != dev->type)
4793                 return -EINVAL;
4794         if (!netif_device_present(dev))
4795                 return -ENODEV;
4796         err = ops->ndo_set_mac_address(dev, sa);
4797         if (err)
4798                 return err;
4799         dev->addr_assign_type = NET_ADDR_SET;
4800         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4801         add_device_randomness(dev->dev_addr, dev->addr_len);
4802         return 0;
4803 }
4804 EXPORT_SYMBOL(dev_set_mac_address);
4805
4806 /**
4807  *      dev_change_carrier - Change device carrier
4808  *      @dev: device
4809  *      @new_carrier: new value
4810  *
4811  *      Change device carrier
4812  */
4813 int dev_change_carrier(struct net_device *dev, bool new_carrier)
4814 {
4815         const struct net_device_ops *ops = dev->netdev_ops;
4816
4817         if (!ops->ndo_change_carrier)
4818                 return -EOPNOTSUPP;
4819         if (!netif_device_present(dev))
4820                 return -ENODEV;
4821         return ops->ndo_change_carrier(dev, new_carrier);
4822 }
4823 EXPORT_SYMBOL(dev_change_carrier);
4824
4825 /**
4826  *      dev_new_index   -       allocate an ifindex
4827  *      @net: the applicable net namespace
4828  *
4829  *      Returns a suitable unique value for a new device interface
4830  *      number.  The caller must hold the rtnl semaphore or the
4831  *      dev_base_lock to be sure it remains unique.
4832  */
4833 static int dev_new_index(struct net *net)
4834 {
4835         int ifindex = net->ifindex;
4836         for (;;) {
4837                 if (++ifindex <= 0)
4838                         ifindex = 1;
4839                 if (!__dev_get_by_index(net, ifindex))
4840                         return net->ifindex = ifindex;
4841         }
4842 }
4843
4844 /* Delayed registration/unregisteration */
4845 static LIST_HEAD(net_todo_list);
4846
4847 static void net_set_todo(struct net_device *dev)
4848 {
4849         list_add_tail(&dev->todo_list, &net_todo_list);
4850 }
4851
4852 static void rollback_registered_many(struct list_head *head)
4853 {
4854         struct net_device *dev, *tmp;
4855
4856         BUG_ON(dev_boot_phase);
4857         ASSERT_RTNL();
4858
4859         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4860                 /* Some devices call without registering
4861                  * for initialization unwind. Remove those
4862                  * devices and proceed with the remaining.
4863                  */
4864                 if (dev->reg_state == NETREG_UNINITIALIZED) {
4865                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4866                                  dev->name, dev);
4867
4868                         WARN_ON(1);
4869                         list_del(&dev->unreg_list);
4870                         continue;
4871                 }
4872                 dev->dismantle = true;
4873                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4874         }
4875
4876         /* If device is running, close it first. */
4877         dev_close_many(head);
4878
4879         list_for_each_entry(dev, head, unreg_list) {
4880                 /* And unlink it from device chain. */
4881                 unlist_netdevice(dev);
4882
4883                 dev->reg_state = NETREG_UNREGISTERING;
4884         }
4885
4886         synchronize_net();
4887
4888         list_for_each_entry(dev, head, unreg_list) {
4889                 /* Shutdown queueing discipline. */
4890                 dev_shutdown(dev);
4891
4892
4893                 /* Notify protocols, that we are about to destroy
4894                    this device. They should clean all the things.
4895                 */
4896                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4897
4898                 if (!dev->rtnl_link_ops ||
4899                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4900                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4901
4902                 /*
4903                  *      Flush the unicast and multicast chains
4904                  */
4905                 dev_uc_flush(dev);
4906                 dev_mc_flush(dev);
4907
4908                 if (dev->netdev_ops->ndo_uninit)
4909                         dev->netdev_ops->ndo_uninit(dev);
4910
4911                 /* Notifier chain MUST detach us all upper devices. */
4912                 WARN_ON(netdev_has_any_upper_dev(dev));
4913
4914                 /* Remove entries from kobject tree */
4915                 netdev_unregister_kobject(dev);
4916 #ifdef CONFIG_XPS
4917                 /* Remove XPS queueing entries */
4918                 netif_reset_xps_queues_gt(dev, 0);
4919 #endif
4920         }
4921
4922         synchronize_net();
4923
4924         list_for_each_entry(dev, head, unreg_list)
4925                 dev_put(dev);
4926 }
4927
4928 static void rollback_registered(struct net_device *dev)
4929 {
4930         LIST_HEAD(single);
4931
4932         list_add(&dev->unreg_list, &single);
4933         rollback_registered_many(&single);
4934         list_del(&single);
4935 }
4936
4937 static netdev_features_t netdev_fix_features(struct net_device *dev,
4938         netdev_features_t features)
4939 {
4940         /* Fix illegal checksum combinations */
4941         if ((features & NETIF_F_HW_CSUM) &&
4942             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4943                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
4944                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4945         }
4946
4947         /* TSO requires that SG is present as well. */
4948         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
4949                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
4950                 features &= ~NETIF_F_ALL_TSO;
4951         }
4952
4953         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
4954                                         !(features & NETIF_F_IP_CSUM)) {
4955                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
4956                 features &= ~NETIF_F_TSO;
4957                 features &= ~NETIF_F_TSO_ECN;
4958         }
4959
4960         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
4961                                          !(features & NETIF_F_IPV6_CSUM)) {
4962                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
4963                 features &= ~NETIF_F_TSO6;
4964         }
4965
4966         /* TSO ECN requires that TSO is present as well. */
4967         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
4968                 features &= ~NETIF_F_TSO_ECN;
4969
4970         /* Software GSO depends on SG. */
4971         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
4972                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
4973                 features &= ~NETIF_F_GSO;
4974         }
4975
4976         /* UFO needs SG and checksumming */
4977         if (features & NETIF_F_UFO) {
4978                 /* maybe split UFO into V4 and V6? */
4979                 if (!((features & NETIF_F_GEN_CSUM) ||
4980                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
4981                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4982                         netdev_dbg(dev,
4983                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
4984                         features &= ~NETIF_F_UFO;
4985                 }
4986
4987                 if (!(features & NETIF_F_SG)) {
4988                         netdev_dbg(dev,
4989                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
4990                         features &= ~NETIF_F_UFO;
4991                 }
4992         }
4993
4994         return features;
4995 }
4996
4997 int __netdev_update_features(struct net_device *dev)
4998 {
4999         netdev_features_t features;
5000         int err = 0;
5001
5002         ASSERT_RTNL();
5003
5004         features = netdev_get_wanted_features(dev);
5005
5006         if (dev->netdev_ops->ndo_fix_features)
5007                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5008
5009         /* driver might be less strict about feature dependencies */
5010         features = netdev_fix_features(dev, features);
5011
5012         if (dev->features == features)
5013                 return 0;
5014
5015         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5016                 &dev->features, &features);
5017
5018         if (dev->netdev_ops->ndo_set_features)
5019                 err = dev->netdev_ops->ndo_set_features(dev, features);
5020
5021         if (unlikely(err < 0)) {
5022                 netdev_err(dev,
5023                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5024                         err, &features, &dev->features);
5025                 return -1;
5026         }
5027
5028         if (!err)
5029                 dev->features = features;
5030
5031         return 1;
5032 }
5033
5034 /**
5035  *      netdev_update_features - recalculate device features
5036  *      @dev: the device to check
5037  *
5038  *      Recalculate dev->features set and send notifications if it
5039  *      has changed. Should be called after driver or hardware dependent
5040  *      conditions might have changed that influence the features.
5041  */
5042 void netdev_update_features(struct net_device *dev)
5043 {
5044         if (__netdev_update_features(dev))
5045                 netdev_features_change(dev);
5046 }
5047 EXPORT_SYMBOL(netdev_update_features);
5048
5049 /**
5050  *      netdev_change_features - recalculate device features
5051  *      @dev: the device to check
5052  *
5053  *      Recalculate dev->features set and send notifications even
5054  *      if they have not changed. Should be called instead of
5055  *      netdev_update_features() if also dev->vlan_features might
5056  *      have changed to allow the changes to be propagated to stacked
5057  *      VLAN devices.
5058  */
5059 void netdev_change_features(struct net_device *dev)
5060 {
5061         __netdev_update_features(dev);
5062         netdev_features_change(dev);
5063 }
5064 EXPORT_SYMBOL(netdev_change_features);
5065
5066 /**
5067  *      netif_stacked_transfer_operstate -      transfer operstate
5068  *      @rootdev: the root or lower level device to transfer state from
5069  *      @dev: the device to transfer operstate to
5070  *
5071  *      Transfer operational state from root to device. This is normally
5072  *      called when a stacking relationship exists between the root
5073  *      device and the device(a leaf device).
5074  */
5075 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5076                                         struct net_device *dev)
5077 {
5078         if (rootdev->operstate == IF_OPER_DORMANT)
5079                 netif_dormant_on(dev);
5080         else
5081                 netif_dormant_off(dev);
5082
5083         if (netif_carrier_ok(rootdev)) {
5084                 if (!netif_carrier_ok(dev))
5085                         netif_carrier_on(dev);
5086         } else {
5087                 if (netif_carrier_ok(dev))
5088                         netif_carrier_off(dev);
5089         }
5090 }
5091 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5092
5093 #ifdef CONFIG_RPS
5094 static int netif_alloc_rx_queues(struct net_device *dev)
5095 {
5096         unsigned int i, count = dev->num_rx_queues;
5097         struct netdev_rx_queue *rx;
5098
5099         BUG_ON(count < 1);
5100
5101         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5102         if (!rx)
5103                 return -ENOMEM;
5104
5105         dev->_rx = rx;
5106
5107         for (i = 0; i < count; i++)
5108                 rx[i].dev = dev;
5109         return 0;
5110 }
5111 #endif
5112
5113 static void netdev_init_one_queue(struct net_device *dev,
5114                                   struct netdev_queue *queue, void *_unused)
5115 {
5116         /* Initialize queue lock */
5117         spin_lock_init(&queue->_xmit_lock);
5118         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5119         queue->xmit_lock_owner = -1;
5120         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5121         queue->dev = dev;
5122 #ifdef CONFIG_BQL
5123         dql_init(&queue->dql, HZ);
5124 #endif
5125 }
5126
5127 static int netif_alloc_netdev_queues(struct net_device *dev)
5128 {
5129         unsigned int count = dev->num_tx_queues;
5130         struct netdev_queue *tx;
5131
5132         BUG_ON(count < 1);
5133
5134         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5135         if (!tx)
5136                 return -ENOMEM;
5137
5138         dev->_tx = tx;
5139
5140         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5141         spin_lock_init(&dev->tx_global_lock);
5142
5143         return 0;
5144 }
5145
5146 /**
5147  *      register_netdevice      - register a network device
5148  *      @dev: device to register
5149  *
5150  *      Take a completed network device structure and add it to the kernel
5151  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5152  *      chain. 0 is returned on success. A negative errno code is returned
5153  *      on a failure to set up the device, or if the name is a duplicate.
5154  *
5155  *      Callers must hold the rtnl semaphore. You may want
5156  *      register_netdev() instead of this.
5157  *
5158  *      BUGS:
5159  *      The locking appears insufficient to guarantee two parallel registers
5160  *      will not get the same name.
5161  */
5162
5163 int register_netdevice(struct net_device *dev)
5164 {
5165         int ret;
5166         struct net *net = dev_net(dev);
5167
5168         BUG_ON(dev_boot_phase);
5169         ASSERT_RTNL();
5170
5171         might_sleep();
5172
5173         /* When net_device's are persistent, this will be fatal. */
5174         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5175         BUG_ON(!net);
5176
5177         spin_lock_init(&dev->addr_list_lock);
5178         netdev_set_addr_lockdep_class(dev);
5179
5180         dev->iflink = -1;
5181
5182         ret = dev_get_valid_name(net, dev, dev->name);
5183         if (ret < 0)
5184                 goto out;
5185
5186         /* Init, if this function is available */
5187         if (dev->netdev_ops->ndo_init) {
5188                 ret = dev->netdev_ops->ndo_init(dev);
5189                 if (ret) {
5190                         if (ret > 0)
5191                                 ret = -EIO;
5192                         goto out;
5193                 }
5194         }
5195
5196         if (((dev->hw_features | dev->features) &
5197              NETIF_F_HW_VLAN_CTAG_FILTER) &&
5198             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5199              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5200                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5201                 ret = -EINVAL;
5202                 goto err_uninit;
5203         }
5204
5205         ret = -EBUSY;
5206         if (!dev->ifindex)
5207                 dev->ifindex = dev_new_index(net);
5208         else if (__dev_get_by_index(net, dev->ifindex))
5209                 goto err_uninit;
5210
5211         if (dev->iflink == -1)
5212                 dev->iflink = dev->ifindex;
5213
5214         /* Transfer changeable features to wanted_features and enable
5215          * software offloads (GSO and GRO).
5216          */
5217         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5218         dev->features |= NETIF_F_SOFT_FEATURES;
5219         dev->wanted_features = dev->features & dev->hw_features;
5220
5221         /* Turn on no cache copy if HW is doing checksum */
5222         if (!(dev->flags & IFF_LOOPBACK)) {
5223                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5224                 if (dev->features & NETIF_F_ALL_CSUM) {
5225                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5226                         dev->features |= NETIF_F_NOCACHE_COPY;
5227                 }
5228         }
5229
5230         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5231          */
5232         dev->vlan_features |= NETIF_F_HIGHDMA;
5233
5234         /* Make NETIF_F_SG inheritable to tunnel devices.
5235          */
5236         dev->hw_enc_features |= NETIF_F_SG;
5237
5238         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5239         ret = notifier_to_errno(ret);
5240         if (ret)
5241                 goto err_uninit;
5242
5243         ret = netdev_register_kobject(dev);
5244         if (ret)
5245                 goto err_uninit;
5246         dev->reg_state = NETREG_REGISTERED;
5247
5248         __netdev_update_features(dev);
5249
5250         /*
5251          *      Default initial state at registry is that the
5252          *      device is present.
5253          */
5254
5255         set_bit(__LINK_STATE_PRESENT, &dev->state);
5256
5257         linkwatch_init_dev(dev);
5258
5259         dev_init_scheduler(dev);
5260         dev_hold(dev);
5261         list_netdevice(dev);
5262         add_device_randomness(dev->dev_addr, dev->addr_len);
5263
5264         /* If the device has permanent device address, driver should
5265          * set dev_addr and also addr_assign_type should be set to
5266          * NET_ADDR_PERM (default value).
5267          */
5268         if (dev->addr_assign_type == NET_ADDR_PERM)
5269                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5270
5271         /* Notify protocols, that a new device appeared. */
5272         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5273         ret = notifier_to_errno(ret);
5274         if (ret) {
5275                 rollback_registered(dev);
5276                 dev->reg_state = NETREG_UNREGISTERED;
5277         }
5278         /*
5279          *      Prevent userspace races by waiting until the network
5280          *      device is fully setup before sending notifications.
5281          */
5282         if (!dev->rtnl_link_ops ||
5283             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5284                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5285
5286 out:
5287         return ret;
5288
5289 err_uninit:
5290         if (dev->netdev_ops->ndo_uninit)
5291                 dev->netdev_ops->ndo_uninit(dev);
5292         goto out;
5293 }
5294 EXPORT_SYMBOL(register_netdevice);
5295
5296 /**
5297  *      init_dummy_netdev       - init a dummy network device for NAPI
5298  *      @dev: device to init
5299  *
5300  *      This takes a network device structure and initialize the minimum
5301  *      amount of fields so it can be used to schedule NAPI polls without
5302  *      registering a full blown interface. This is to be used by drivers
5303  *      that need to tie several hardware interfaces to a single NAPI
5304  *      poll scheduler due to HW limitations.
5305  */
5306 int init_dummy_netdev(struct net_device *dev)
5307 {
5308         /* Clear everything. Note we don't initialize spinlocks
5309          * are they aren't supposed to be taken by any of the
5310          * NAPI code and this dummy netdev is supposed to be
5311          * only ever used for NAPI polls
5312          */
5313         memset(dev, 0, sizeof(struct net_device));
5314
5315         /* make sure we BUG if trying to hit standard
5316          * register/unregister code path
5317          */
5318         dev->reg_state = NETREG_DUMMY;
5319
5320         /* NAPI wants this */
5321         INIT_LIST_HEAD(&dev->napi_list);
5322
5323         /* a dummy interface is started by default */
5324         set_bit(__LINK_STATE_PRESENT, &dev->state);
5325         set_bit(__LINK_STATE_START, &dev->state);
5326
5327         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5328          * because users of this 'device' dont need to change
5329          * its refcount.
5330          */
5331
5332         return 0;
5333 }
5334 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5335
5336
5337 /**
5338  *      register_netdev - register a network device
5339  *      @dev: device to register
5340  *
5341  *      Take a completed network device structure and add it to the kernel
5342  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5343  *      chain. 0 is returned on success. A negative errno code is returned
5344  *      on a failure to set up the device, or if the name is a duplicate.
5345  *
5346  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5347  *      and expands the device name if you passed a format string to
5348  *      alloc_netdev.
5349  */
5350 int register_netdev(struct net_device *dev)
5351 {
5352         int err;
5353
5354         rtnl_lock();
5355         err = register_netdevice(dev);
5356         rtnl_unlock();
5357         return err;
5358 }
5359 EXPORT_SYMBOL(register_netdev);
5360
5361 int netdev_refcnt_read(const struct net_device *dev)
5362 {
5363         int i, refcnt = 0;
5364
5365         for_each_possible_cpu(i)
5366                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5367         return refcnt;
5368 }
5369 EXPORT_SYMBOL(netdev_refcnt_read);
5370
5371 /**
5372  * netdev_wait_allrefs - wait until all references are gone.
5373  * @dev: target net_device
5374  *
5375  * This is called when unregistering network devices.
5376  *
5377  * Any protocol or device that holds a reference should register
5378  * for netdevice notification, and cleanup and put back the
5379  * reference if they receive an UNREGISTER event.
5380  * We can get stuck here if buggy protocols don't correctly
5381  * call dev_put.
5382  */
5383 static void netdev_wait_allrefs(struct net_device *dev)
5384 {
5385         unsigned long rebroadcast_time, warning_time;
5386         int refcnt;
5387
5388         linkwatch_forget_dev(dev);
5389
5390         rebroadcast_time = warning_time = jiffies;
5391         refcnt = netdev_refcnt_read(dev);
5392
5393         while (refcnt != 0) {
5394                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5395                         rtnl_lock();
5396
5397                         /* Rebroadcast unregister notification */
5398                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5399
5400                         __rtnl_unlock();
5401                         rcu_barrier();
5402                         rtnl_lock();
5403
5404                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5405                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5406                                      &dev->state)) {
5407                                 /* We must not have linkwatch events
5408                                  * pending on unregister. If this
5409                                  * happens, we simply run the queue
5410                                  * unscheduled, resulting in a noop
5411                                  * for this device.
5412                                  */
5413                                 linkwatch_run_queue();
5414                         }
5415
5416                         __rtnl_unlock();
5417
5418                         rebroadcast_time = jiffies;
5419                 }
5420
5421                 msleep(250);
5422
5423                 refcnt = netdev_refcnt_read(dev);
5424
5425                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5426                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5427                                  dev->name, refcnt);
5428                         warning_time = jiffies;
5429                 }
5430         }
5431 }
5432
5433 /* The sequence is:
5434  *
5435  *      rtnl_lock();
5436  *      ...
5437  *      register_netdevice(x1);
5438  *      register_netdevice(x2);
5439  *      ...
5440  *      unregister_netdevice(y1);
5441  *      unregister_netdevice(y2);
5442  *      ...
5443  *      rtnl_unlock();
5444  *      free_netdev(y1);
5445  *      free_netdev(y2);
5446  *
5447  * We are invoked by rtnl_unlock().
5448  * This allows us to deal with problems:
5449  * 1) We can delete sysfs objects which invoke hotplug
5450  *    without deadlocking with linkwatch via keventd.
5451  * 2) Since we run with the RTNL semaphore not held, we can sleep
5452  *    safely in order to wait for the netdev refcnt to drop to zero.
5453  *
5454  * We must not return until all unregister events added during
5455  * the interval the lock was held have been completed.
5456  */
5457 void netdev_run_todo(void)
5458 {
5459         struct list_head list;
5460
5461         /* Snapshot list, allow later requests */
5462         list_replace_init(&net_todo_list, &list);
5463
5464         __rtnl_unlock();
5465
5466
5467         /* Wait for rcu callbacks to finish before next phase */
5468         if (!list_empty(&list))
5469                 rcu_barrier();
5470
5471         while (!list_empty(&list)) {
5472                 struct net_device *dev
5473                         = list_first_entry(&list, struct net_device, todo_list);
5474                 list_del(&dev->todo_list);
5475
5476                 rtnl_lock();
5477                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5478                 __rtnl_unlock();
5479
5480                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5481                         pr_err("network todo '%s' but state %d\n",
5482                                dev->name, dev->reg_state);
5483                         dump_stack();
5484                         continue;
5485                 }
5486
5487                 dev->reg_state = NETREG_UNREGISTERED;
5488
5489                 on_each_cpu(flush_backlog, dev, 1);
5490
5491                 netdev_wait_allrefs(dev);
5492
5493                 /* paranoia */
5494                 BUG_ON(netdev_refcnt_read(dev));
5495                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5496                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5497                 WARN_ON(dev->dn_ptr);
5498
5499                 if (dev->destructor)
5500                         dev->destructor(dev);
5501
5502                 /* Free network device */
5503                 kobject_put(&dev->dev.kobj);
5504         }
5505 }
5506
5507 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5508  * fields in the same order, with only the type differing.
5509  */
5510 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5511                              const struct net_device_stats *netdev_stats)
5512 {
5513 #if BITS_PER_LONG == 64
5514         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5515         memcpy(stats64, netdev_stats, sizeof(*stats64));
5516 #else
5517         size_t i, n = sizeof(*stats64) / sizeof(u64);
5518         const unsigned long *src = (const unsigned long *)netdev_stats;
5519         u64 *dst = (u64 *)stats64;
5520
5521         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5522                      sizeof(*stats64) / sizeof(u64));
5523         for (i = 0; i < n; i++)
5524                 dst[i] = src[i];
5525 #endif
5526 }
5527 EXPORT_SYMBOL(netdev_stats_to_stats64);
5528
5529 /**
5530  *      dev_get_stats   - get network device statistics
5531  *      @dev: device to get statistics from
5532  *      @storage: place to store stats
5533  *
5534  *      Get network statistics from device. Return @storage.
5535  *      The device driver may provide its own method by setting
5536  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5537  *      otherwise the internal statistics structure is used.
5538  */
5539 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5540                                         struct rtnl_link_stats64 *storage)
5541 {
5542         const struct net_device_ops *ops = dev->netdev_ops;
5543
5544         if (ops->ndo_get_stats64) {
5545                 memset(storage, 0, sizeof(*storage));
5546                 ops->ndo_get_stats64(dev, storage);
5547         } else if (ops->ndo_get_stats) {
5548                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5549         } else {
5550                 netdev_stats_to_stats64(storage, &dev->stats);
5551         }
5552         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5553         return storage;
5554 }
5555 EXPORT_SYMBOL(dev_get_stats);
5556
5557 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5558 {
5559         struct netdev_queue *queue = dev_ingress_queue(dev);
5560
5561 #ifdef CONFIG_NET_CLS_ACT
5562         if (queue)
5563                 return queue;
5564         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5565         if (!queue)
5566                 return NULL;
5567         netdev_init_one_queue(dev, queue, NULL);
5568         queue->qdisc = &noop_qdisc;
5569         queue->qdisc_sleeping = &noop_qdisc;
5570         rcu_assign_pointer(dev->ingress_queue, queue);
5571 #endif
5572         return queue;
5573 }
5574
5575 static const struct ethtool_ops default_ethtool_ops;
5576
5577 void netdev_set_default_ethtool_ops(struct net_device *dev,
5578                                     const struct ethtool_ops *ops)
5579 {
5580         if (dev->ethtool_ops == &default_ethtool_ops)
5581                 dev->ethtool_ops = ops;
5582 }
5583 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5584
5585 /**
5586  *      alloc_netdev_mqs - allocate network device
5587  *      @sizeof_priv:   size of private data to allocate space for
5588  *      @name:          device name format string
5589  *      @setup:         callback to initialize device
5590  *      @txqs:          the number of TX subqueues to allocate
5591  *      @rxqs:          the number of RX subqueues to allocate
5592  *
5593  *      Allocates a struct net_device with private data area for driver use
5594  *      and performs basic initialization.  Also allocates subquue structs
5595  *      for each queue on the device.
5596  */
5597 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5598                 void (*setup)(struct net_device *),
5599                 unsigned int txqs, unsigned int rxqs)
5600 {
5601         struct net_device *dev;
5602         size_t alloc_size;
5603         struct net_device *p;
5604
5605         BUG_ON(strlen(name) >= sizeof(dev->name));
5606
5607         if (txqs < 1) {
5608                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5609                 return NULL;
5610         }
5611
5612 #ifdef CONFIG_RPS
5613         if (rxqs < 1) {
5614                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5615                 return NULL;
5616         }
5617 #endif
5618
5619         alloc_size = sizeof(struct net_device);
5620         if (sizeof_priv) {
5621                 /* ensure 32-byte alignment of private area */
5622                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5623                 alloc_size += sizeof_priv;
5624         }
5625         /* ensure 32-byte alignment of whole construct */
5626         alloc_size += NETDEV_ALIGN - 1;
5627
5628         p = kzalloc(alloc_size, GFP_KERNEL);
5629         if (!p)
5630                 return NULL;
5631
5632         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5633         dev->padded = (char *)dev - (char *)p;
5634
5635         dev->pcpu_refcnt = alloc_percpu(int);
5636         if (!dev->pcpu_refcnt)
5637                 goto free_p;
5638
5639         if (dev_addr_init(dev))
5640                 goto free_pcpu;
5641
5642         dev_mc_init(dev);
5643         dev_uc_init(dev);
5644
5645         dev_net_set(dev, &init_net);
5646
5647         dev->gso_max_size = GSO_MAX_SIZE;
5648         dev->gso_max_segs = GSO_MAX_SEGS;
5649
5650         INIT_LIST_HEAD(&dev->napi_list);
5651         INIT_LIST_HEAD(&dev->unreg_list);
5652         INIT_LIST_HEAD(&dev->link_watch_list);
5653         INIT_LIST_HEAD(&dev->upper_dev_list);
5654         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5655         setup(dev);
5656
5657         dev->num_tx_queues = txqs;
5658         dev->real_num_tx_queues = txqs;
5659         if (netif_alloc_netdev_queues(dev))
5660                 goto free_all;
5661
5662 #ifdef CONFIG_RPS
5663         dev->num_rx_queues = rxqs;
5664         dev->real_num_rx_queues = rxqs;
5665         if (netif_alloc_rx_queues(dev))
5666                 goto free_all;
5667 #endif
5668
5669         strcpy(dev->name, name);
5670         dev->group = INIT_NETDEV_GROUP;
5671         if (!dev->ethtool_ops)
5672                 dev->ethtool_ops = &default_ethtool_ops;
5673         return dev;
5674
5675 free_all:
5676         free_netdev(dev);
5677         return NULL;
5678
5679 free_pcpu:
5680         free_percpu(dev->pcpu_refcnt);
5681         kfree(dev->_tx);
5682 #ifdef CONFIG_RPS
5683         kfree(dev->_rx);
5684 #endif
5685
5686 free_p:
5687         kfree(p);
5688         return NULL;
5689 }
5690 EXPORT_SYMBOL(alloc_netdev_mqs);
5691
5692 /**
5693  *      free_netdev - free network device
5694  *      @dev: device
5695  *
5696  *      This function does the last stage of destroying an allocated device
5697  *      interface. The reference to the device object is released.
5698  *      If this is the last reference then it will be freed.
5699  */
5700 void free_netdev(struct net_device *dev)
5701 {
5702         struct napi_struct *p, *n;
5703
5704         release_net(dev_net(dev));
5705
5706         kfree(dev->_tx);
5707 #ifdef CONFIG_RPS
5708         kfree(dev->_rx);
5709 #endif
5710
5711         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5712
5713         /* Flush device addresses */
5714         dev_addr_flush(dev);
5715
5716         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5717                 netif_napi_del(p);
5718
5719         free_percpu(dev->pcpu_refcnt);
5720         dev->pcpu_refcnt = NULL;
5721
5722         /*  Compatibility with error handling in drivers */
5723         if (dev->reg_state == NETREG_UNINITIALIZED) {
5724                 kfree((char *)dev - dev->padded);
5725                 return;
5726         }
5727
5728         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5729         dev->reg_state = NETREG_RELEASED;
5730
5731         /* will free via device release */
5732         put_device(&dev->dev);
5733 }
5734 EXPORT_SYMBOL(free_netdev);
5735
5736 /**
5737  *      synchronize_net -  Synchronize with packet receive processing
5738  *
5739  *      Wait for packets currently being received to be done.
5740  *      Does not block later packets from starting.
5741  */
5742 void synchronize_net(void)
5743 {
5744         might_sleep();
5745         if (rtnl_is_locked())
5746                 synchronize_rcu_expedited();
5747         else
5748                 synchronize_rcu();
5749 }
5750 EXPORT_SYMBOL(synchronize_net);
5751
5752 /**
5753  *      unregister_netdevice_queue - remove device from the kernel
5754  *      @dev: device
5755  *      @head: list
5756  *
5757  *      This function shuts down a device interface and removes it
5758  *      from the kernel tables.
5759  *      If head not NULL, device is queued to be unregistered later.
5760  *
5761  *      Callers must hold the rtnl semaphore.  You may want
5762  *      unregister_netdev() instead of this.
5763  */
5764
5765 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5766 {
5767         ASSERT_RTNL();
5768
5769         if (head) {
5770                 list_move_tail(&dev->unreg_list, head);
5771         } else {
5772                 rollback_registered(dev);
5773                 /* Finish processing unregister after unlock */
5774                 net_set_todo(dev);
5775         }
5776 }
5777 EXPORT_SYMBOL(unregister_netdevice_queue);
5778
5779 /**
5780  *      unregister_netdevice_many - unregister many devices
5781  *      @head: list of devices
5782  */
5783 void unregister_netdevice_many(struct list_head *head)
5784 {
5785         struct net_device *dev;
5786
5787         if (!list_empty(head)) {
5788                 rollback_registered_many(head);
5789                 list_for_each_entry(dev, head, unreg_list)
5790                         net_set_todo(dev);
5791         }
5792 }
5793 EXPORT_SYMBOL(unregister_netdevice_many);
5794
5795 /**
5796  *      unregister_netdev - remove device from the kernel
5797  *      @dev: device
5798  *
5799  *      This function shuts down a device interface and removes it
5800  *      from the kernel tables.
5801  *
5802  *      This is just a wrapper for unregister_netdevice that takes
5803  *      the rtnl semaphore.  In general you want to use this and not
5804  *      unregister_netdevice.
5805  */
5806 void unregister_netdev(struct net_device *dev)
5807 {
5808         rtnl_lock();
5809         unregister_netdevice(dev);
5810         rtnl_unlock();
5811 }
5812 EXPORT_SYMBOL(unregister_netdev);
5813
5814 /**
5815  *      dev_change_net_namespace - move device to different nethost namespace
5816  *      @dev: device
5817  *      @net: network namespace
5818  *      @pat: If not NULL name pattern to try if the current device name
5819  *            is already taken in the destination network namespace.
5820  *
5821  *      This function shuts down a device interface and moves it
5822  *      to a new network namespace. On success 0 is returned, on
5823  *      a failure a netagive errno code is returned.
5824  *
5825  *      Callers must hold the rtnl semaphore.
5826  */
5827
5828 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5829 {
5830         int err;
5831
5832         ASSERT_RTNL();
5833
5834         /* Don't allow namespace local devices to be moved. */
5835         err = -EINVAL;
5836         if (dev->features & NETIF_F_NETNS_LOCAL)
5837                 goto out;
5838
5839         /* Ensure the device has been registrered */
5840         if (dev->reg_state != NETREG_REGISTERED)
5841                 goto out;
5842
5843         /* Get out if there is nothing todo */
5844         err = 0;
5845         if (net_eq(dev_net(dev), net))
5846                 goto out;
5847
5848         /* Pick the destination device name, and ensure
5849          * we can use it in the destination network namespace.
5850          */
5851         err = -EEXIST;
5852         if (__dev_get_by_name(net, dev->name)) {
5853                 /* We get here if we can't use the current device name */
5854                 if (!pat)
5855                         goto out;
5856                 if (dev_get_valid_name(net, dev, pat) < 0)
5857                         goto out;
5858         }
5859
5860         /*
5861          * And now a mini version of register_netdevice unregister_netdevice.
5862          */
5863
5864         /* If device is running close it first. */
5865         dev_close(dev);
5866
5867         /* And unlink it from device chain */
5868         err = -ENODEV;
5869         unlist_netdevice(dev);
5870
5871         synchronize_net();
5872
5873         /* Shutdown queueing discipline. */
5874         dev_shutdown(dev);
5875
5876         /* Notify protocols, that we are about to destroy
5877            this device. They should clean all the things.
5878
5879            Note that dev->reg_state stays at NETREG_REGISTERED.
5880            This is wanted because this way 8021q and macvlan know
5881            the device is just moving and can keep their slaves up.
5882         */
5883         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5884         rcu_barrier();
5885         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5886         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5887
5888         /*
5889          *      Flush the unicast and multicast chains
5890          */
5891         dev_uc_flush(dev);
5892         dev_mc_flush(dev);
5893
5894         /* Send a netdev-removed uevent to the old namespace */
5895         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
5896
5897         /* Actually switch the network namespace */
5898         dev_net_set(dev, net);
5899
5900         /* If there is an ifindex conflict assign a new one */
5901         if (__dev_get_by_index(net, dev->ifindex)) {
5902                 int iflink = (dev->iflink == dev->ifindex);
5903                 dev->ifindex = dev_new_index(net);
5904                 if (iflink)
5905                         dev->iflink = dev->ifindex;
5906         }
5907
5908         /* Send a netdev-add uevent to the new namespace */
5909         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
5910
5911         /* Fixup kobjects */
5912         err = device_rename(&dev->dev, dev->name);
5913         WARN_ON(err);
5914
5915         /* Add the device back in the hashes */
5916         list_netdevice(dev);
5917
5918         /* Notify protocols, that a new device appeared. */
5919         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5920
5921         /*
5922          *      Prevent userspace races by waiting until the network
5923          *      device is fully setup before sending notifications.
5924          */
5925         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5926
5927         synchronize_net();
5928         err = 0;
5929 out:
5930         return err;
5931 }
5932 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5933
5934 static int dev_cpu_callback(struct notifier_block *nfb,
5935                             unsigned long action,
5936                             void *ocpu)
5937 {
5938         struct sk_buff **list_skb;
5939         struct sk_buff *skb;
5940         unsigned int cpu, oldcpu = (unsigned long)ocpu;
5941         struct softnet_data *sd, *oldsd;
5942
5943         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5944                 return NOTIFY_OK;
5945
5946         local_irq_disable();
5947         cpu = smp_processor_id();
5948         sd = &per_cpu(softnet_data, cpu);
5949         oldsd = &per_cpu(softnet_data, oldcpu);
5950
5951         /* Find end of our completion_queue. */
5952         list_skb = &sd->completion_queue;
5953         while (*list_skb)
5954                 list_skb = &(*list_skb)->next;
5955         /* Append completion queue from offline CPU. */
5956         *list_skb = oldsd->completion_queue;
5957         oldsd->completion_queue = NULL;
5958
5959         /* Append output queue from offline CPU. */
5960         if (oldsd->output_queue) {
5961                 *sd->output_queue_tailp = oldsd->output_queue;
5962                 sd->output_queue_tailp = oldsd->output_queue_tailp;
5963                 oldsd->output_queue = NULL;
5964                 oldsd->output_queue_tailp = &oldsd->output_queue;
5965         }
5966         /* Append NAPI poll list from offline CPU. */
5967         if (!list_empty(&oldsd->poll_list)) {
5968                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
5969                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
5970         }
5971
5972         raise_softirq_irqoff(NET_TX_SOFTIRQ);
5973         local_irq_enable();
5974
5975         /* Process offline CPU's input_pkt_queue */
5976         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5977                 netif_rx(skb);
5978                 input_queue_head_incr(oldsd);
5979         }
5980         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5981                 netif_rx(skb);
5982                 input_queue_head_incr(oldsd);
5983         }
5984
5985         return NOTIFY_OK;
5986 }
5987
5988
5989 /**
5990  *      netdev_increment_features - increment feature set by one
5991  *      @all: current feature set
5992  *      @one: new feature set
5993  *      @mask: mask feature set
5994  *
5995  *      Computes a new feature set after adding a device with feature set
5996  *      @one to the master device with current feature set @all.  Will not
5997  *      enable anything that is off in @mask. Returns the new feature set.
5998  */
5999 netdev_features_t netdev_increment_features(netdev_features_t all,
6000         netdev_features_t one, netdev_features_t mask)
6001 {
6002         if (mask & NETIF_F_GEN_CSUM)
6003                 mask |= NETIF_F_ALL_CSUM;
6004         mask |= NETIF_F_VLAN_CHALLENGED;
6005
6006         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6007         all &= one | ~NETIF_F_ALL_FOR_ALL;
6008
6009         /* If one device supports hw checksumming, set for all. */
6010         if (all & NETIF_F_GEN_CSUM)
6011                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6012
6013         return all;
6014 }
6015 EXPORT_SYMBOL(netdev_increment_features);
6016
6017 static struct hlist_head *netdev_create_hash(void)
6018 {
6019         int i;
6020         struct hlist_head *hash;
6021
6022         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6023         if (hash != NULL)
6024                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6025                         INIT_HLIST_HEAD(&hash[i]);
6026
6027         return hash;
6028 }
6029
6030 /* Initialize per network namespace state */
6031 static int __net_init netdev_init(struct net *net)
6032 {
6033         if (net != &init_net)
6034                 INIT_LIST_HEAD(&net->dev_base_head);
6035
6036         net->dev_name_head = netdev_create_hash();
6037         if (net->dev_name_head == NULL)
6038                 goto err_name;
6039
6040         net->dev_index_head = netdev_create_hash();
6041         if (net->dev_index_head == NULL)
6042                 goto err_idx;
6043
6044         return 0;
6045
6046 err_idx:
6047         kfree(net->dev_name_head);
6048 err_name:
6049         return -ENOMEM;
6050 }
6051
6052 /**
6053  *      netdev_drivername - network driver for the device
6054  *      @dev: network device
6055  *
6056  *      Determine network driver for device.
6057  */
6058 const char *netdev_drivername(const struct net_device *dev)
6059 {
6060         const struct device_driver *driver;
6061         const struct device *parent;
6062         const char *empty = "";
6063
6064         parent = dev->dev.parent;
6065         if (!parent)
6066                 return empty;
6067
6068         driver = parent->driver;
6069         if (driver && driver->name)
6070                 return driver->name;
6071         return empty;
6072 }
6073
6074 static int __netdev_printk(const char *level, const struct net_device *dev,
6075                            struct va_format *vaf)
6076 {
6077         int r;
6078
6079         if (dev && dev->dev.parent) {
6080                 r = dev_printk_emit(level[1] - '0',
6081                                     dev->dev.parent,
6082                                     "%s %s %s: %pV",
6083                                     dev_driver_string(dev->dev.parent),
6084                                     dev_name(dev->dev.parent),
6085                                     netdev_name(dev), vaf);
6086         } else if (dev) {
6087                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6088         } else {
6089                 r = printk("%s(NULL net_device): %pV", level, vaf);
6090         }
6091
6092         return r;
6093 }
6094
6095 int netdev_printk(const char *level, const struct net_device *dev,
6096                   const char *format, ...)
6097 {
6098         struct va_format vaf;
6099         va_list args;
6100         int r;
6101
6102         va_start(args, format);
6103
6104         vaf.fmt = format;
6105         vaf.va = &args;
6106
6107         r = __netdev_printk(level, dev, &vaf);
6108
6109         va_end(args);
6110
6111         return r;
6112 }
6113 EXPORT_SYMBOL(netdev_printk);
6114
6115 #define define_netdev_printk_level(func, level)                 \
6116 int func(const struct net_device *dev, const char *fmt, ...)    \
6117 {                                                               \
6118         int r;                                                  \
6119         struct va_format vaf;                                   \
6120         va_list args;                                           \
6121                                                                 \
6122         va_start(args, fmt);                                    \
6123                                                                 \
6124         vaf.fmt = fmt;                                          \
6125         vaf.va = &args;                                         \
6126                                                                 \
6127         r = __netdev_printk(level, dev, &vaf);                  \
6128                                                                 \
6129         va_end(args);                                           \
6130                                                                 \
6131         return r;                                               \
6132 }                                                               \
6133 EXPORT_SYMBOL(func);
6134
6135 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6136 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6137 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6138 define_netdev_printk_level(netdev_err, KERN_ERR);
6139 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6140 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6141 define_netdev_printk_level(netdev_info, KERN_INFO);
6142
6143 static void __net_exit netdev_exit(struct net *net)
6144 {
6145         kfree(net->dev_name_head);
6146         kfree(net->dev_index_head);
6147 }
6148
6149 static struct pernet_operations __net_initdata netdev_net_ops = {
6150         .init = netdev_init,
6151         .exit = netdev_exit,
6152 };
6153
6154 static void __net_exit default_device_exit(struct net *net)
6155 {
6156         struct net_device *dev, *aux;
6157         /*
6158          * Push all migratable network devices back to the
6159          * initial network namespace
6160          */
6161         rtnl_lock();
6162         for_each_netdev_safe(net, dev, aux) {
6163                 int err;
6164                 char fb_name[IFNAMSIZ];
6165
6166                 /* Ignore unmoveable devices (i.e. loopback) */
6167                 if (dev->features & NETIF_F_NETNS_LOCAL)
6168                         continue;
6169
6170                 /* Leave virtual devices for the generic cleanup */
6171                 if (dev->rtnl_link_ops)
6172                         continue;
6173
6174                 /* Push remaining network devices to init_net */
6175                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6176                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6177                 if (err) {
6178                         pr_emerg("%s: failed to move %s to init_net: %d\n",
6179                                  __func__, dev->name, err);
6180                         BUG();
6181                 }
6182         }
6183         rtnl_unlock();
6184 }
6185
6186 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6187 {
6188         /* At exit all network devices most be removed from a network
6189          * namespace.  Do this in the reverse order of registration.
6190          * Do this across as many network namespaces as possible to
6191          * improve batching efficiency.
6192          */
6193         struct net_device *dev;
6194         struct net *net;
6195         LIST_HEAD(dev_kill_list);
6196
6197         rtnl_lock();
6198         list_for_each_entry(net, net_list, exit_list) {
6199                 for_each_netdev_reverse(net, dev) {
6200                         if (dev->rtnl_link_ops)
6201                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6202                         else
6203                                 unregister_netdevice_queue(dev, &dev_kill_list);
6204                 }
6205         }
6206         unregister_netdevice_many(&dev_kill_list);
6207         list_del(&dev_kill_list);
6208         rtnl_unlock();
6209 }
6210
6211 static struct pernet_operations __net_initdata default_device_ops = {
6212         .exit = default_device_exit,
6213         .exit_batch = default_device_exit_batch,
6214 };
6215
6216 /*
6217  *      Initialize the DEV module. At boot time this walks the device list and
6218  *      unhooks any devices that fail to initialise (normally hardware not
6219  *      present) and leaves us with a valid list of present and active devices.
6220  *
6221  */
6222
6223 /*
6224  *       This is called single threaded during boot, so no need
6225  *       to take the rtnl semaphore.
6226  */
6227 static int __init net_dev_init(void)
6228 {
6229         int i, rc = -ENOMEM;
6230
6231         BUG_ON(!dev_boot_phase);
6232
6233         if (dev_proc_init())
6234                 goto out;
6235
6236         if (netdev_kobject_init())
6237                 goto out;
6238
6239         INIT_LIST_HEAD(&ptype_all);
6240         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6241                 INIT_LIST_HEAD(&ptype_base[i]);
6242
6243         INIT_LIST_HEAD(&offload_base);
6244
6245         if (register_pernet_subsys(&netdev_net_ops))
6246                 goto out;
6247
6248         /*
6249          *      Initialise the packet receive queues.
6250          */
6251
6252         for_each_possible_cpu(i) {
6253                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6254
6255                 memset(sd, 0, sizeof(*sd));
6256                 skb_queue_head_init(&sd->input_pkt_queue);
6257                 skb_queue_head_init(&sd->process_queue);
6258                 sd->completion_queue = NULL;
6259                 INIT_LIST_HEAD(&sd->poll_list);
6260                 sd->output_queue = NULL;
6261                 sd->output_queue_tailp = &sd->output_queue;
6262 #ifdef CONFIG_RPS
6263                 sd->csd.func = rps_trigger_softirq;
6264                 sd->csd.info = sd;
6265                 sd->csd.flags = 0;
6266                 sd->cpu = i;
6267 #endif
6268
6269                 sd->backlog.poll = process_backlog;
6270                 sd->backlog.weight = weight_p;
6271                 sd->backlog.gro_list = NULL;
6272                 sd->backlog.gro_count = 0;
6273         }
6274
6275         dev_boot_phase = 0;
6276
6277         /* The loopback device is special if any other network devices
6278          * is present in a network namespace the loopback device must
6279          * be present. Since we now dynamically allocate and free the
6280          * loopback device ensure this invariant is maintained by
6281          * keeping the loopback device as the first device on the
6282          * list of network devices.  Ensuring the loopback devices
6283          * is the first device that appears and the last network device
6284          * that disappears.
6285          */
6286         if (register_pernet_device(&loopback_net_ops))
6287                 goto out;
6288
6289         if (register_pernet_device(&default_device_ops))
6290                 goto out;
6291
6292         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6293         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6294
6295         hotcpu_notifier(dev_cpu_callback, 0);
6296         dst_init();
6297         rc = 0;
6298 out:
6299         return rc;
6300 }
6301
6302 subsys_initcall(net_dev_init);