net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <net/mpls.h>
 122 #include <linux/ipv6.h>
 123 #include <linux/in.h>
 124 #include <linux/jhash.h>
 125 #include <linux/random.h>
 126 #include <trace/events/napi.h>
 127 #include <trace/events/net.h>
 128 #include <trace/events/skb.h>
 129 #include <linux/pci.h>
 130 #include <linux/inetdevice.h>
 131 #include <linux/cpu_rmap.h>
 132 #include <linux/static_key.h>
 133 #include <linux/hashtable.h>
 134 #include <linux/vmalloc.h>
 135 #include <linux/if_macvlan.h>
 136 #include <linux/errqueue.h>
 137 #include <linux/hrtimer.h>
 138
 139 #include "net-sysfs.h"
 140
 141 /* Instead of increasing this, you should create a hash table. */
 142 #define MAX_GRO_SKBS 8
 143
 144 /* This should be increased if a protocol with a bigger head is added. */
 145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147 static DEFINE_SPINLOCK(ptype_lock);
 148 static DEFINE_SPINLOCK(offload_lock);
 149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 150 struct list_head ptype_all __read_mostly;       /* Taps */
 151 static struct list_head offload_base __read_mostly;
 152
 153 static int netif_rx_internal(struct sk_buff *skb);
 154 static int call_netdevice_notifiers_info(unsigned long val,
 155                                          struct net_device *dev,
 156                                          struct netdev_notifier_info *info);
 157
 158 /*
 159  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 160  * semaphore.
 161  *
 162  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 163  *
 164  * Writers must hold the rtnl semaphore while they loop through the
 165  * dev_base_head list, and hold dev_base_lock for writing when they do the
 166  * actual updates.  This allows pure readers to access the list even
 167  * while a writer is preparing to update it.
 168  *
 169  * To put it another way, dev_base_lock is held for writing only to
 170  * protect against pure readers; the rtnl semaphore provides the
 171  * protection against other writers.
 172  *
 173  * See, for example usages, register_netdevice() and
 174  * unregister_netdevice(), which must be called with the rtnl
 175  * semaphore held.
 176  */
 177 DEFINE_RWLOCK(dev_base_lock);
 178 EXPORT_SYMBOL(dev_base_lock);
 179
 180 /* protects napi_hash addition/deletion and napi_gen_id */
 181 static DEFINE_SPINLOCK(napi_hash_lock);
 182
 183 static unsigned int napi_gen_id;
 184 static DEFINE_HASHTABLE(napi_hash, 8);
 185
 186 static seqcount_t devnet_rename_seq;
 187
 188 static inline void dev_base_seq_inc(struct net *net)
 189 {
 190         while (++net->dev_base_seq == 0);
 191 }
 192
 193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 194 {
 195         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 196
 197         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 198 }
 199
 200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 201 {
 202         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 203 }
 204
 205 static inline void rps_lock(struct softnet_data *sd)
 206 {
 207 #ifdef CONFIG_RPS
 208         spin_lock(&sd->input_pkt_queue.lock);
 209 #endif
 210 }
 211
 212 static inline void rps_unlock(struct softnet_data *sd)
 213 {
 214 #ifdef CONFIG_RPS
 215         spin_unlock(&sd->input_pkt_queue.lock);
 216 #endif
 217 }
 218
 219 /* Device list insertion */
 220 static void list_netdevice(struct net_device *dev)
 221 {
 222         struct net *net = dev_net(dev);
 223
 224         ASSERT_RTNL();
 225
 226         write_lock_bh(&dev_base_lock);
 227         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 228         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 229         hlist_add_head_rcu(&dev->index_hlist,
 230                            dev_index_hash(net, dev->ifindex));
 231         write_unlock_bh(&dev_base_lock);
 232
 233         dev_base_seq_inc(net);
 234 }
 235
 236 /* Device list removal
 237  * caller must respect a RCU grace period before freeing/reusing dev
 238  */
 239 static void unlist_netdevice(struct net_device *dev)
 240 {
 241         ASSERT_RTNL();
 242
 243         /* Unlink dev from the device chain */
 244         write_lock_bh(&dev_base_lock);
 245         list_del_rcu(&dev->dev_list);
 246         hlist_del_rcu(&dev->name_hlist);
 247         hlist_del_rcu(&dev->index_hlist);
 248         write_unlock_bh(&dev_base_lock);
 249
 250         dev_base_seq_inc(dev_net(dev));
 251 }
 252
 253 /*
 254  *      Our notifier list
 255  */
 256
 257 static RAW_NOTIFIER_HEAD(netdev_chain);
 258
 259 /*
 260  *      Device drivers call our routines to queue packets here. We empty the
 261  *      queue in the local softnet handler.
 262  */
 263
 264 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 265 EXPORT_PER_CPU_SYMBOL(softnet_data);
 266
 267 #ifdef CONFIG_LOCKDEP
 268 /*
 269  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 270  * according to dev->type
 271  */
 272 static const unsigned short netdev_lock_type[] =
 273         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 274          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 275          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 276          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 277          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 278          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 279          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 280          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 281          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 282          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 283          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 284          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 285          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 286          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 287          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 288
 289 static const char *const netdev_lock_name[] =
 290         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 291          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 292          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 293          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 294          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 295          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 296          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 297          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 298          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 299          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 300          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 301          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 302          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 303          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 304          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 305
 306 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 307 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 308
 309 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 310 {
 311         int i;
 312
 313         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 314                 if (netdev_lock_type[i] == dev_type)
 315                         return i;
 316         /* the last key is used by default */
 317         return ARRAY_SIZE(netdev_lock_type) - 1;
 318 }
 319
 320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 321                                                  unsigned short dev_type)
 322 {
 323         int i;
 324
 325         i = netdev_lock_pos(dev_type);
 326         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 327                                    netdev_lock_name[i]);
 328 }
 329
 330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 331 {
 332         int i;
 333
 334         i = netdev_lock_pos(dev->type);
 335         lockdep_set_class_and_name(&dev->addr_list_lock,
 336                                    &netdev_addr_lock_key[i],
 337                                    netdev_lock_name[i]);
 338 }
 339 #else
 340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341                                                  unsigned short dev_type)
 342 {
 343 }
 344 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 345 {
 346 }
 347 #endif
 348
 349 /*******************************************************************************
 350
 351                 Protocol management and registration routines
 352
 353 *******************************************************************************/
 354
 355 /*
 356  *      Add a protocol ID to the list. Now that the input handler is
 357  *      smarter we can dispense with all the messy stuff that used to be
 358  *      here.
 359  *
 360  *      BEWARE!!! Protocol handlers, mangling input packets,
 361  *      MUST BE last in hash buckets and checking protocol handlers
 362  *      MUST start from promiscuous ptype_all chain in net_bh.
 363  *      It is true now, do not change it.
 364  *      Explanation follows: if protocol handler, mangling packet, will
 365  *      be the first on list, it is not able to sense, that packet
 366  *      is cloned and should be copied-on-write, so that it will
 367  *      change it and subsequent readers will get broken packet.
 368  *                                                      --ANK (980803)
 369  */
 370
 371 static inline struct list_head *ptype_head(const struct packet_type *pt)
 372 {
 373         if (pt->type == htons(ETH_P_ALL))
 374                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 375         else
 376                 return pt->dev ? &pt->dev->ptype_specific :
 377                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 378 }
 379
 380 /**
 381  *      dev_add_pack - add packet handler
 382  *      @pt: packet type declaration
 383  *
 384  *      Add a protocol handler to the networking stack. The passed &packet_type
 385  *      is linked into kernel lists and may not be freed until it has been
 386  *      removed from the kernel lists.
 387  *
 388  *      This call does not sleep therefore it can not
 389  *      guarantee all CPU's that are in middle of receiving packets
 390  *      will see the new packet type (until the next received packet).
 391  */
 392
 393 void dev_add_pack(struct packet_type *pt)
 394 {
 395         struct list_head *head = ptype_head(pt);
 396
 397         spin_lock(&ptype_lock);
 398         list_add_rcu(&pt->list, head);
 399         spin_unlock(&ptype_lock);
 400 }
 401 EXPORT_SYMBOL(dev_add_pack);
 402
 403 /**
 404  *      __dev_remove_pack        - remove packet handler
 405  *      @pt: packet type declaration
 406  *
 407  *      Remove a protocol handler that was previously added to the kernel
 408  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 409  *      from the kernel lists and can be freed or reused once this function
 410  *      returns.
 411  *
 412  *      The packet type might still be in use by receivers
 413  *      and must not be freed until after all the CPU's have gone
 414  *      through a quiescent state.
 415  */
 416 void __dev_remove_pack(struct packet_type *pt)
 417 {
 418         struct list_head *head = ptype_head(pt);
 419         struct packet_type *pt1;
 420
 421         spin_lock(&ptype_lock);
 422
 423         list_for_each_entry(pt1, head, list) {
 424                 if (pt == pt1) {
 425                         list_del_rcu(&pt->list);
 426                         goto out;
 427                 }
 428         }
 429
 430         pr_warn("dev_remove_pack: %p not found\n", pt);
 431 out:
 432         spin_unlock(&ptype_lock);
 433 }
 434 EXPORT_SYMBOL(__dev_remove_pack);
 435
 436 /**
 437  *      dev_remove_pack  - remove packet handler
 438  *      @pt: packet type declaration
 439  *
 440  *      Remove a protocol handler that was previously added to the kernel
 441  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 442  *      from the kernel lists and can be freed or reused once this function
 443  *      returns.
 444  *
 445  *      This call sleeps to guarantee that no CPU is looking at the packet
 446  *      type after return.
 447  */
 448 void dev_remove_pack(struct packet_type *pt)
 449 {
 450         __dev_remove_pack(pt);
 451
 452         synchronize_net();
 453 }
 454 EXPORT_SYMBOL(dev_remove_pack);
 455
 456
 457 /**
 458  *      dev_add_offload - register offload handlers
 459  *      @po: protocol offload declaration
 460  *
 461  *      Add protocol offload handlers to the networking stack. The passed
 462  *      &proto_offload is linked into kernel lists and may not be freed until
 463  *      it has been removed from the kernel lists.
 464  *
 465  *      This call does not sleep therefore it can not
 466  *      guarantee all CPU's that are in middle of receiving packets
 467  *      will see the new offload handlers (until the next received packet).
 468  */
 469 void dev_add_offload(struct packet_offload *po)
 470 {
 471         struct list_head *head = &offload_base;
 472
 473         spin_lock(&offload_lock);
 474         list_add_rcu(&po->list, head);
 475         spin_unlock(&offload_lock);
 476 }
 477 EXPORT_SYMBOL(dev_add_offload);
 478
 479 /**
 480  *      __dev_remove_offload     - remove offload handler
 481  *      @po: packet offload declaration
 482  *
 483  *      Remove a protocol offload handler that was previously added to the
 484  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 485  *      is removed from the kernel lists and can be freed or reused once this
 486  *      function returns.
 487  *
 488  *      The packet type might still be in use by receivers
 489  *      and must not be freed until after all the CPU's have gone
 490  *      through a quiescent state.
 491  */
 492 static void __dev_remove_offload(struct packet_offload *po)
 493 {
 494         struct list_head *head = &offload_base;
 495         struct packet_offload *po1;
 496
 497         spin_lock(&offload_lock);
 498
 499         list_for_each_entry(po1, head, list) {
 500                 if (po == po1) {
 501                         list_del_rcu(&po->list);
 502                         goto out;
 503                 }
 504         }
 505
 506         pr_warn("dev_remove_offload: %p not found\n", po);
 507 out:
 508         spin_unlock(&offload_lock);
 509 }
 510
 511 /**
 512  *      dev_remove_offload       - remove packet offload handler
 513  *      @po: packet offload declaration
 514  *
 515  *      Remove a packet offload handler that was previously added to the kernel
 516  *      offload handlers by dev_add_offload(). The passed &offload_type is
 517  *      removed from the kernel lists and can be freed or reused once this
 518  *      function returns.
 519  *
 520  *      This call sleeps to guarantee that no CPU is looking at the packet
 521  *      type after return.
 522  */
 523 void dev_remove_offload(struct packet_offload *po)
 524 {
 525         __dev_remove_offload(po);
 526
 527         synchronize_net();
 528 }
 529 EXPORT_SYMBOL(dev_remove_offload);
 530
 531 /******************************************************************************
 532
 533                       Device Boot-time Settings Routines
 534
 535 *******************************************************************************/
 536
 537 /* Boot time configuration table */
 538 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 539
 540 /**
 541  *      netdev_boot_setup_add   - add new setup entry
 542  *      @name: name of the device
 543  *      @map: configured settings for the device
 544  *
 545  *      Adds new setup entry to the dev_boot_setup list.  The function
 546  *      returns 0 on error and 1 on success.  This is a generic routine to
 547  *      all netdevices.
 548  */
 549 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 550 {
 551         struct netdev_boot_setup *s;
 552         int i;
 553
 554         s = dev_boot_setup;
 555         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 556                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 557                         memset(s[i].name, 0, sizeof(s[i].name));
 558                         strlcpy(s[i].name, name, IFNAMSIZ);
 559                         memcpy(&s[i].map, map, sizeof(s[i].map));
 560                         break;
 561                 }
 562         }
 563
 564         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 565 }
 566
 567 /**
 568  *      netdev_boot_setup_check - check boot time settings
 569  *      @dev: the netdevice
 570  *
 571  *      Check boot time settings for the device.
 572  *      The found settings are set for the device to be used
 573  *      later in the device probing.
 574  *      Returns 0 if no settings found, 1 if they are.
 575  */
 576 int netdev_boot_setup_check(struct net_device *dev)
 577 {
 578         struct netdev_boot_setup *s = dev_boot_setup;
 579         int i;
 580
 581         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 582                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 583                     !strcmp(dev->name, s[i].name)) {
 584                         dev->irq        = s[i].map.irq;
 585                         dev->base_addr  = s[i].map.base_addr;
 586                         dev->mem_start  = s[i].map.mem_start;
 587                         dev->mem_end    = s[i].map.mem_end;
 588                         return 1;
 589                 }
 590         }
 591         return 0;
 592 }
 593 EXPORT_SYMBOL(netdev_boot_setup_check);
 594
 595
 596 /**
 597  *      netdev_boot_base        - get address from boot time settings
 598  *      @prefix: prefix for network device
 599  *      @unit: id for network device
 600  *
 601  *      Check boot time settings for the base address of device.
 602  *      The found settings are set for the device to be used
 603  *      later in the device probing.
 604  *      Returns 0 if no settings found.
 605  */
 606 unsigned long netdev_boot_base(const char *prefix, int unit)
 607 {
 608         const struct netdev_boot_setup *s = dev_boot_setup;
 609         char name[IFNAMSIZ];
 610         int i;
 611
 612         sprintf(name, "%s%d", prefix, unit);
 613
 614         /*
 615          * If device already registered then return base of 1
 616          * to indicate not to probe for this interface
 617          */
 618         if (__dev_get_by_name(&init_net, name))
 619                 return 1;
 620
 621         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 622                 if (!strcmp(name, s[i].name))
 623                         return s[i].map.base_addr;
 624         return 0;
 625 }
 626
 627 /*
 628  * Saves at boot time configured settings for any netdevice.
 629  */
 630 int __init netdev_boot_setup(char *str)
 631 {
 632         int ints[5];
 633         struct ifmap map;
 634
 635         str = get_options(str, ARRAY_SIZE(ints), ints);
 636         if (!str || !*str)
 637                 return 0;
 638
 639         /* Save settings */
 640         memset(&map, 0, sizeof(map));
 641         if (ints[0] > 0)
 642                 map.irq = ints[1];
 643         if (ints[0] > 1)
 644                 map.base_addr = ints[2];
 645         if (ints[0] > 2)
 646                 map.mem_start = ints[3];
 647         if (ints[0] > 3)
 648                 map.mem_end = ints[4];
 649
 650         /* Add new entry to the list */
 651         return netdev_boot_setup_add(str, &map);
 652 }
 653
 654 __setup("netdev=", netdev_boot_setup);
 655
 656 /*******************************************************************************
 657
 658                             Device Interface Subroutines
 659
 660 *******************************************************************************/
 661
 662 /**
 663  *      dev_get_iflink  - get 'iflink' value of a interface
 664  *      @dev: targeted interface
 665  *
 666  *      Indicates the ifindex the interface is linked to.
 667  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 668  */
 669
 670 int dev_get_iflink(const struct net_device *dev)
 671 {
 672         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 673                 return dev->netdev_ops->ndo_get_iflink(dev);
 674
 675         /* If dev->rtnl_link_ops is set, it's a virtual interface. */
 676         if (dev->rtnl_link_ops)
 677                 return 0;
 678
 679         return dev->ifindex;
 680 }
 681 EXPORT_SYMBOL(dev_get_iflink);
 682
 683 /**
 684  *      __dev_get_by_name       - find a device by its name
 685  *      @net: the applicable net namespace
 686  *      @name: name to find
 687  *
 688  *      Find an interface by name. Must be called under RTNL semaphore
 689  *      or @dev_base_lock. If the name is found a pointer to the device
 690  *      is returned. If the name is not found then %NULL is returned. The
 691  *      reference counters are not incremented so the caller must be
 692  *      careful with locks.
 693  */
 694
 695 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 696 {
 697         struct net_device *dev;
 698         struct hlist_head *head = dev_name_hash(net, name);
 699
 700         hlist_for_each_entry(dev, head, name_hlist)
 701                 if (!strncmp(dev->name, name, IFNAMSIZ))
 702                         return dev;
 703
 704         return NULL;
 705 }
 706 EXPORT_SYMBOL(__dev_get_by_name);
 707
 708 /**
 709  *      dev_get_by_name_rcu     - find a device by its name
 710  *      @net: the applicable net namespace
 711  *      @name: name to find
 712  *
 713  *      Find an interface by name.
 714  *      If the name is found a pointer to the device is returned.
 715  *      If the name is not found then %NULL is returned.
 716  *      The reference counters are not incremented so the caller must be
 717  *      careful with locks. The caller must hold RCU lock.
 718  */
 719
 720 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 721 {
 722         struct net_device *dev;
 723         struct hlist_head *head = dev_name_hash(net, name);
 724
 725         hlist_for_each_entry_rcu(dev, head, name_hlist)
 726                 if (!strncmp(dev->name, name, IFNAMSIZ))
 727                         return dev;
 728
 729         return NULL;
 730 }
 731 EXPORT_SYMBOL(dev_get_by_name_rcu);
 732
 733 /**
 734  *      dev_get_by_name         - find a device by its name
 735  *      @net: the applicable net namespace
 736  *      @name: name to find
 737  *
 738  *      Find an interface by name. This can be called from any
 739  *      context and does its own locking. The returned handle has
 740  *      the usage count incremented and the caller must use dev_put() to
 741  *      release it when it is no longer needed. %NULL is returned if no
 742  *      matching device is found.
 743  */
 744
 745 struct net_device *dev_get_by_name(struct net *net, const char *name)
 746 {
 747         struct net_device *dev;
 748
 749         rcu_read_lock();
 750         dev = dev_get_by_name_rcu(net, name);
 751         if (dev)
 752                 dev_hold(dev);
 753         rcu_read_unlock();
 754         return dev;
 755 }
 756 EXPORT_SYMBOL(dev_get_by_name);
 757
 758 /**
 759  *      __dev_get_by_index - find a device by its ifindex
 760  *      @net: the applicable net namespace
 761  *      @ifindex: index of device
 762  *
 763  *      Search for an interface by index. Returns %NULL if the device
 764  *      is not found or a pointer to the device. The device has not
 765  *      had its reference counter increased so the caller must be careful
 766  *      about locking. The caller must hold either the RTNL semaphore
 767  *      or @dev_base_lock.
 768  */
 769
 770 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 771 {
 772         struct net_device *dev;
 773         struct hlist_head *head = dev_index_hash(net, ifindex);
 774
 775         hlist_for_each_entry(dev, head, index_hlist)
 776                 if (dev->ifindex == ifindex)
 777                         return dev;
 778
 779         return NULL;
 780 }
 781 EXPORT_SYMBOL(__dev_get_by_index);
 782
 783 /**
 784  *      dev_get_by_index_rcu - find a device by its ifindex
 785  *      @net: the applicable net namespace
 786  *      @ifindex: index of device
 787  *
 788  *      Search for an interface by index. Returns %NULL if the device
 789  *      is not found or a pointer to the device. The device has not
 790  *      had its reference counter increased so the caller must be careful
 791  *      about locking. The caller must hold RCU lock.
 792  */
 793
 794 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 795 {
 796         struct net_device *dev;
 797         struct hlist_head *head = dev_index_hash(net, ifindex);
 798
 799         hlist_for_each_entry_rcu(dev, head, index_hlist)
 800                 if (dev->ifindex == ifindex)
 801                         return dev;
 802
 803         return NULL;
 804 }
 805 EXPORT_SYMBOL(dev_get_by_index_rcu);
 806
 807
 808 /**
 809  *      dev_get_by_index - find a device by its ifindex
 810  *      @net: the applicable net namespace
 811  *      @ifindex: index of device
 812  *
 813  *      Search for an interface by index. Returns NULL if the device
 814  *      is not found or a pointer to the device. The device returned has
 815  *      had a reference added and the pointer is safe until the user calls
 816  *      dev_put to indicate they have finished with it.
 817  */
 818
 819 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 820 {
 821         struct net_device *dev;
 822
 823         rcu_read_lock();
 824         dev = dev_get_by_index_rcu(net, ifindex);
 825         if (dev)
 826                 dev_hold(dev);
 827         rcu_read_unlock();
 828         return dev;
 829 }
 830 EXPORT_SYMBOL(dev_get_by_index);
 831
 832 /**
 833  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 834  *      @net: network namespace
 835  *      @name: a pointer to the buffer where the name will be stored.
 836  *      @ifindex: the ifindex of the interface to get the name from.
 837  *
 838  *      The use of raw_seqcount_begin() and cond_resched() before
 839  *      retrying is required as we want to give the writers a chance
 840  *      to complete when CONFIG_PREEMPT is not set.
 841  */
 842 int netdev_get_name(struct net *net, char *name, int ifindex)
 843 {
 844         struct net_device *dev;
 845         unsigned int seq;
 846
 847 retry:
 848         seq = raw_seqcount_begin(&devnet_rename_seq);
 849         rcu_read_lock();
 850         dev = dev_get_by_index_rcu(net, ifindex);
 851         if (!dev) {
 852                 rcu_read_unlock();
 853                 return -ENODEV;
 854         }
 855
 856         strcpy(name, dev->name);
 857         rcu_read_unlock();
 858         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 859                 cond_resched();
 860                 goto retry;
 861         }
 862
 863         return 0;
 864 }
 865
 866 /**
 867  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 868  *      @net: the applicable net namespace
 869  *      @type: media type of device
 870  *      @ha: hardware address
 871  *
 872  *      Search for an interface by MAC address. Returns NULL if the device
 873  *      is not found or a pointer to the device.
 874  *      The caller must hold RCU or RTNL.
 875  *      The returned device has not had its ref count increased
 876  *      and the caller must therefore be careful about locking
 877  *
 878  */
 879
 880 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 881                                        const char *ha)
 882 {
 883         struct net_device *dev;
 884
 885         for_each_netdev_rcu(net, dev)
 886                 if (dev->type == type &&
 887                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 888                         return dev;
 889
 890         return NULL;
 891 }
 892 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 893
 894 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 895 {
 896         struct net_device *dev;
 897
 898         ASSERT_RTNL();
 899         for_each_netdev(net, dev)
 900                 if (dev->type == type)
 901                         return dev;
 902
 903         return NULL;
 904 }
 905 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 906
 907 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 908 {
 909         struct net_device *dev, *ret = NULL;
 910
 911         rcu_read_lock();
 912         for_each_netdev_rcu(net, dev)
 913                 if (dev->type == type) {
 914                         dev_hold(dev);
 915                         ret = dev;
 916                         break;
 917                 }
 918         rcu_read_unlock();
 919         return ret;
 920 }
 921 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 922
 923 /**
 924  *      __dev_get_by_flags - find any device with given flags
 925  *      @net: the applicable net namespace
 926  *      @if_flags: IFF_* values
 927  *      @mask: bitmask of bits in if_flags to check
 928  *
 929  *      Search for any interface with the given flags. Returns NULL if a device
 930  *      is not found or a pointer to the device. Must be called inside
 931  *      rtnl_lock(), and result refcount is unchanged.
 932  */
 933
 934 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 935                                       unsigned short mask)
 936 {
 937         struct net_device *dev, *ret;
 938
 939         ASSERT_RTNL();
 940
 941         ret = NULL;
 942         for_each_netdev(net, dev) {
 943                 if (((dev->flags ^ if_flags) & mask) == 0) {
 944                         ret = dev;
 945                         break;
 946                 }
 947         }
 948         return ret;
 949 }
 950 EXPORT_SYMBOL(__dev_get_by_flags);
 951
 952 /**
 953  *      dev_valid_name - check if name is okay for network device
 954  *      @name: name string
 955  *
 956  *      Network device names need to be valid file names to
 957  *      to allow sysfs to work.  We also disallow any kind of
 958  *      whitespace.
 959  */
 960 bool dev_valid_name(const char *name)
 961 {
 962         if (*name == '\0')
 963                 return false;
 964         if (strlen(name) >= IFNAMSIZ)
 965                 return false;
 966         if (!strcmp(name, ".") || !strcmp(name, ".."))
 967                 return false;
 968
 969         while (*name) {
 970                 if (*name == '/' || *name == ':' || isspace(*name))
 971                         return false;
 972                 name++;
 973         }
 974         return true;
 975 }
 976 EXPORT_SYMBOL(dev_valid_name);
 977
 978 /**
 979  *      __dev_alloc_name - allocate a name for a device
 980  *      @net: network namespace to allocate the device name in
 981  *      @name: name format string
 982  *      @buf:  scratch buffer and result name string
 983  *
 984  *      Passed a format string - eg "lt%d" it will try and find a suitable
 985  *      id. It scans list of devices to build up a free map, then chooses
 986  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 987  *      while allocating the name and adding the device in order to avoid
 988  *      duplicates.
 989  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 990  *      Returns the number of the unit assigned or a negative errno code.
 991  */
 992
 993 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 994 {
 995         int i = 0;
 996         const char *p;
 997         const int max_netdevices = 8*PAGE_SIZE;
 998         unsigned long *inuse;
 999         struct net_device *d;
1000
1001         p = strnchr(name, IFNAMSIZ-1, '%');
1002         if (p) {
1003                 /*
1004                  * Verify the string as this thing may have come from
1005                  * the user.  There must be either one "%d" and no other "%"
1006                  * characters.
1007                  */
1008                 if (p[1] != 'd' || strchr(p + 2, '%'))
1009                         return -EINVAL;
1010
1011                 /* Use one page as a bit array of possible slots */
1012                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1013                 if (!inuse)
1014                         return -ENOMEM;
1015
1016                 for_each_netdev(net, d) {
1017                         if (!sscanf(d->name, name, &i))
1018                                 continue;
1019                         if (i < 0 || i >= max_netdevices)
1020                                 continue;
1021
1022                         /*  avoid cases where sscanf is not exact inverse of printf */
1023                         snprintf(buf, IFNAMSIZ, name, i);
1024                         if (!strncmp(buf, d->name, IFNAMSIZ))
1025                                 set_bit(i, inuse);
1026                 }
1027
1028                 i = find_first_zero_bit(inuse, max_netdevices);
1029                 free_page((unsigned long) inuse);
1030         }
1031
1032         if (buf != name)
1033                 snprintf(buf, IFNAMSIZ, name, i);
1034         if (!__dev_get_by_name(net, buf))
1035                 return i;
1036
1037         /* It is possible to run out of possible slots
1038          * when the name is long and there isn't enough space left
1039          * for the digits, or if all bits are used.
1040          */
1041         return -ENFILE;
1042 }
1043
1044 /**
1045  *      dev_alloc_name - allocate a name for a device
1046  *      @dev: device
1047  *      @name: name format string
1048  *
1049  *      Passed a format string - eg "lt%d" it will try and find a suitable
1050  *      id. It scans list of devices to build up a free map, then chooses
1051  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1052  *      while allocating the name and adding the device in order to avoid
1053  *      duplicates.
1054  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1055  *      Returns the number of the unit assigned or a negative errno code.
1056  */
1057
1058 int dev_alloc_name(struct net_device *dev, const char *name)
1059 {
1060         char buf[IFNAMSIZ];
1061         struct net *net;
1062         int ret;
1063
1064         BUG_ON(!dev_net(dev));
1065         net = dev_net(dev);
1066         ret = __dev_alloc_name(net, name, buf);
1067         if (ret >= 0)
1068                 strlcpy(dev->name, buf, IFNAMSIZ);
1069         return ret;
1070 }
1071 EXPORT_SYMBOL(dev_alloc_name);
1072
1073 static int dev_alloc_name_ns(struct net *net,
1074                              struct net_device *dev,
1075                              const char *name)
1076 {
1077         char buf[IFNAMSIZ];
1078         int ret;
1079
1080         ret = __dev_alloc_name(net, name, buf);
1081         if (ret >= 0)
1082                 strlcpy(dev->name, buf, IFNAMSIZ);
1083         return ret;
1084 }
1085
1086 static int dev_get_valid_name(struct net *net,
1087                               struct net_device *dev,
1088                               const char *name)
1089 {
1090         BUG_ON(!net);
1091
1092         if (!dev_valid_name(name))
1093                 return -EINVAL;
1094
1095         if (strchr(name, '%'))
1096                 return dev_alloc_name_ns(net, dev, name);
1097         else if (__dev_get_by_name(net, name))
1098                 return -EEXIST;
1099         else if (dev->name != name)
1100                 strlcpy(dev->name, name, IFNAMSIZ);
1101
1102         return 0;
1103 }
1104
1105 /**
1106  *      dev_change_name - change name of a device
1107  *      @dev: device
1108  *      @newname: name (or format string) must be at least IFNAMSIZ
1109  *
1110  *      Change name of a device, can pass format strings "eth%d".
1111  *      for wildcarding.
1112  */
1113 int dev_change_name(struct net_device *dev, const char *newname)
1114 {
1115         unsigned char old_assign_type;
1116         char oldname[IFNAMSIZ];
1117         int err = 0;
1118         int ret;
1119         struct net *net;
1120
1121         ASSERT_RTNL();
1122         BUG_ON(!dev_net(dev));
1123
1124         net = dev_net(dev);
1125         if (dev->flags & IFF_UP)
1126                 return -EBUSY;
1127
1128         write_seqcount_begin(&devnet_rename_seq);
1129
1130         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1131                 write_seqcount_end(&devnet_rename_seq);
1132                 return 0;
1133         }
1134
1135         memcpy(oldname, dev->name, IFNAMSIZ);
1136
1137         err = dev_get_valid_name(net, dev, newname);
1138         if (err < 0) {
1139                 write_seqcount_end(&devnet_rename_seq);
1140                 return err;
1141         }
1142
1143         if (oldname[0] && !strchr(oldname, '%'))
1144                 netdev_info(dev, "renamed from %s\n", oldname);
1145
1146         old_assign_type = dev->name_assign_type;
1147         dev->name_assign_type = NET_NAME_RENAMED;
1148
1149 rollback:
1150         ret = device_rename(&dev->dev, dev->name);
1151         if (ret) {
1152                 memcpy(dev->name, oldname, IFNAMSIZ);
1153                 dev->name_assign_type = old_assign_type;
1154                 write_seqcount_end(&devnet_rename_seq);
1155                 return ret;
1156         }
1157
1158         write_seqcount_end(&devnet_rename_seq);
1159
1160         netdev_adjacent_rename_links(dev, oldname);
1161
1162         write_lock_bh(&dev_base_lock);
1163         hlist_del_rcu(&dev->name_hlist);
1164         write_unlock_bh(&dev_base_lock);
1165
1166         synchronize_rcu();
1167
1168         write_lock_bh(&dev_base_lock);
1169         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1170         write_unlock_bh(&dev_base_lock);
1171
1172         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1173         ret = notifier_to_errno(ret);
1174
1175         if (ret) {
1176                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1177                 if (err >= 0) {
1178                         err = ret;
1179                         write_seqcount_begin(&devnet_rename_seq);
1180                         memcpy(dev->name, oldname, IFNAMSIZ);
1181                         memcpy(oldname, newname, IFNAMSIZ);
1182                         dev->name_assign_type = old_assign_type;
1183                         old_assign_type = NET_NAME_RENAMED;
1184                         goto rollback;
1185                 } else {
1186                         pr_err("%s: name change rollback failed: %d\n",
1187                                dev->name, ret);
1188                 }
1189         }
1190
1191         return err;
1192 }
1193
1194 /**
1195  *      dev_set_alias - change ifalias of a device
1196  *      @dev: device
1197  *      @alias: name up to IFALIASZ
1198  *      @len: limit of bytes to copy from info
1199  *
1200  *      Set ifalias for a device,
1201  */
1202 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1203 {
1204         char *new_ifalias;
1205
1206         ASSERT_RTNL();
1207
1208         if (len >= IFALIASZ)
1209                 return -EINVAL;
1210
1211         if (!len) {
1212                 kfree(dev->ifalias);
1213                 dev->ifalias = NULL;
1214                 return 0;
1215         }
1216
1217         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1218         if (!new_ifalias)
1219                 return -ENOMEM;
1220         dev->ifalias = new_ifalias;
1221
1222         strlcpy(dev->ifalias, alias, len+1);
1223         return len;
1224 }
1225
1226
1227 /**
1228  *      netdev_features_change - device changes features
1229  *      @dev: device to cause notification
1230  *
1231  *      Called to indicate a device has changed features.
1232  */
1233 void netdev_features_change(struct net_device *dev)
1234 {
1235         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1236 }
1237 EXPORT_SYMBOL(netdev_features_change);
1238
1239 /**
1240  *      netdev_state_change - device changes state
1241  *      @dev: device to cause notification
1242  *
1243  *      Called to indicate a device has changed state. This function calls
1244  *      the notifier chains for netdev_chain and sends a NEWLINK message
1245  *      to the routing socket.
1246  */
1247 void netdev_state_change(struct net_device *dev)
1248 {
1249         if (dev->flags & IFF_UP) {
1250                 struct netdev_notifier_change_info change_info;
1251
1252                 change_info.flags_changed = 0;
1253                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1254                                               &change_info.info);
1255                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1256         }
1257 }
1258 EXPORT_SYMBOL(netdev_state_change);
1259
1260 /**
1261  *      netdev_notify_peers - notify network peers about existence of @dev
1262  *      @dev: network device
1263  *
1264  * Generate traffic such that interested network peers are aware of
1265  * @dev, such as by generating a gratuitous ARP. This may be used when
1266  * a device wants to inform the rest of the network about some sort of
1267  * reconfiguration such as a failover event or virtual machine
1268  * migration.
1269  */
1270 void netdev_notify_peers(struct net_device *dev)
1271 {
1272         rtnl_lock();
1273         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1274         rtnl_unlock();
1275 }
1276 EXPORT_SYMBOL(netdev_notify_peers);
1277
1278 static int __dev_open(struct net_device *dev)
1279 {
1280         const struct net_device_ops *ops = dev->netdev_ops;
1281         int ret;
1282
1283         ASSERT_RTNL();
1284
1285         if (!netif_device_present(dev))
1286                 return -ENODEV;
1287
1288         /* Block netpoll from trying to do any rx path servicing.
1289          * If we don't do this there is a chance ndo_poll_controller
1290          * or ndo_poll may be running while we open the device
1291          */
1292         netpoll_poll_disable(dev);
1293
1294         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1295         ret = notifier_to_errno(ret);
1296         if (ret)
1297                 return ret;
1298
1299         set_bit(__LINK_STATE_START, &dev->state);
1300
1301         if (ops->ndo_validate_addr)
1302                 ret = ops->ndo_validate_addr(dev);
1303
1304         if (!ret && ops->ndo_open)
1305                 ret = ops->ndo_open(dev);
1306
1307         netpoll_poll_enable(dev);
1308
1309         if (ret)
1310                 clear_bit(__LINK_STATE_START, &dev->state);
1311         else {
1312                 dev->flags |= IFF_UP;
1313                 dev_set_rx_mode(dev);
1314                 dev_activate(dev);
1315                 add_device_randomness(dev->dev_addr, dev->addr_len);
1316         }
1317
1318         return ret;
1319 }
1320
1321 /**
1322  *      dev_open        - prepare an interface for use.
1323  *      @dev:   device to open
1324  *
1325  *      Takes a device from down to up state. The device's private open
1326  *      function is invoked and then the multicast lists are loaded. Finally
1327  *      the device is moved into the up state and a %NETDEV_UP message is
1328  *      sent to the netdev notifier chain.
1329  *
1330  *      Calling this function on an active interface is a nop. On a failure
1331  *      a negative errno code is returned.
1332  */
1333 int dev_open(struct net_device *dev)
1334 {
1335         int ret;
1336
1337         if (dev->flags & IFF_UP)
1338                 return 0;
1339
1340         ret = __dev_open(dev);
1341         if (ret < 0)
1342                 return ret;
1343
1344         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1345         call_netdevice_notifiers(NETDEV_UP, dev);
1346
1347         return ret;
1348 }
1349 EXPORT_SYMBOL(dev_open);
1350
1351 static int __dev_close_many(struct list_head *head)
1352 {
1353         struct net_device *dev;
1354
1355         ASSERT_RTNL();
1356         might_sleep();
1357
1358         list_for_each_entry(dev, head, close_list) {
1359                 /* Temporarily disable netpoll until the interface is down */
1360                 netpoll_poll_disable(dev);
1361
1362                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1363
1364                 clear_bit(__LINK_STATE_START, &dev->state);
1365
1366                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1367                  * can be even on different cpu. So just clear netif_running().
1368                  *
1369                  * dev->stop() will invoke napi_disable() on all of it's
1370                  * napi_struct instances on this device.
1371                  */
1372                 smp_mb__after_atomic(); /* Commit netif_running(). */
1373         }
1374
1375         dev_deactivate_many(head);
1376
1377         list_for_each_entry(dev, head, close_list) {
1378                 const struct net_device_ops *ops = dev->netdev_ops;
1379
1380                 /*
1381                  *      Call the device specific close. This cannot fail.
1382                  *      Only if device is UP
1383                  *
1384                  *      We allow it to be called even after a DETACH hot-plug
1385                  *      event.
1386                  */
1387                 if (ops->ndo_stop)
1388                         ops->ndo_stop(dev);
1389
1390                 dev->flags &= ~IFF_UP;
1391                 netpoll_poll_enable(dev);
1392         }
1393
1394         return 0;
1395 }
1396
1397 static int __dev_close(struct net_device *dev)
1398 {
1399         int retval;
1400         LIST_HEAD(single);
1401
1402         list_add(&dev->close_list, &single);
1403         retval = __dev_close_many(&single);
1404         list_del(&single);
1405
1406         return retval;
1407 }
1408
1409 int dev_close_many(struct list_head *head, bool unlink)
1410 {
1411         struct net_device *dev, *tmp;
1412
1413         /* Remove the devices that don't need to be closed */
1414         list_for_each_entry_safe(dev, tmp, head, close_list)
1415                 if (!(dev->flags & IFF_UP))
1416                         list_del_init(&dev->close_list);
1417
1418         __dev_close_many(head);
1419
1420         list_for_each_entry_safe(dev, tmp, head, close_list) {
1421                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1422                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1423                 if (unlink)
1424                         list_del_init(&dev->close_list);
1425         }
1426
1427         return 0;
1428 }
1429 EXPORT_SYMBOL(dev_close_many);
1430
1431 /**
1432  *      dev_close - shutdown an interface.
1433  *      @dev: device to shutdown
1434  *
1435  *      This function moves an active device into down state. A
1436  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1437  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1438  *      chain.
1439  */
1440 int dev_close(struct net_device *dev)
1441 {
1442         if (dev->flags & IFF_UP) {
1443                 LIST_HEAD(single);
1444
1445                 list_add(&dev->close_list, &single);
1446                 dev_close_many(&single, true);
1447                 list_del(&single);
1448         }
1449         return 0;
1450 }
1451 EXPORT_SYMBOL(dev_close);
1452
1453
1454 /**
1455  *      dev_disable_lro - disable Large Receive Offload on a device
1456  *      @dev: device
1457  *
1458  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1459  *      called under RTNL.  This is needed if received packets may be
1460  *      forwarded to another interface.
1461  */
1462 void dev_disable_lro(struct net_device *dev)
1463 {
1464         struct net_device *lower_dev;
1465         struct list_head *iter;
1466
1467         dev->wanted_features &= ~NETIF_F_LRO;
1468         netdev_update_features(dev);
1469
1470         if (unlikely(dev->features & NETIF_F_LRO))
1471                 netdev_WARN(dev, "failed to disable LRO!\n");
1472
1473         netdev_for_each_lower_dev(dev, lower_dev, iter)
1474                 dev_disable_lro(lower_dev);
1475 }
1476 EXPORT_SYMBOL(dev_disable_lro);
1477
1478 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1479                                    struct net_device *dev)
1480 {
1481         struct netdev_notifier_info info;
1482
1483         netdev_notifier_info_init(&info, dev);
1484         return nb->notifier_call(nb, val, &info);
1485 }
1486
1487 static int dev_boot_phase = 1;
1488
1489 /**
1490  *      register_netdevice_notifier - register a network notifier block
1491  *      @nb: notifier
1492  *
1493  *      Register a notifier to be called when network device events occur.
1494  *      The notifier passed is linked into the kernel structures and must
1495  *      not be reused until it has been unregistered. A negative errno code
1496  *      is returned on a failure.
1497  *
1498  *      When registered all registration and up events are replayed
1499  *      to the new notifier to allow device to have a race free
1500  *      view of the network device list.
1501  */
1502
1503 int register_netdevice_notifier(struct notifier_block *nb)
1504 {
1505         struct net_device *dev;
1506         struct net_device *last;
1507         struct net *net;
1508         int err;
1509
1510         rtnl_lock();
1511         err = raw_notifier_chain_register(&netdev_chain, nb);
1512         if (err)
1513                 goto unlock;
1514         if (dev_boot_phase)
1515                 goto unlock;
1516         for_each_net(net) {
1517                 for_each_netdev(net, dev) {
1518                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1519                         err = notifier_to_errno(err);
1520                         if (err)
1521                                 goto rollback;
1522
1523                         if (!(dev->flags & IFF_UP))
1524                                 continue;
1525
1526                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1527                 }
1528         }
1529
1530 unlock:
1531         rtnl_unlock();
1532         return err;
1533
1534 rollback:
1535         last = dev;
1536         for_each_net(net) {
1537                 for_each_netdev(net, dev) {
1538                         if (dev == last)
1539                                 goto outroll;
1540
1541                         if (dev->flags & IFF_UP) {
1542                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1543                                                         dev);
1544                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1545                         }
1546                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1547                 }
1548         }
1549
1550 outroll:
1551         raw_notifier_chain_unregister(&netdev_chain, nb);
1552         goto unlock;
1553 }
1554 EXPORT_SYMBOL(register_netdevice_notifier);
1555
1556 /**
1557  *      unregister_netdevice_notifier - unregister a network notifier block
1558  *      @nb: notifier
1559  *
1560  *      Unregister a notifier previously registered by
1561  *      register_netdevice_notifier(). The notifier is unlinked into the
1562  *      kernel structures and may then be reused. A negative errno code
1563  *      is returned on a failure.
1564  *
1565  *      After unregistering unregister and down device events are synthesized
1566  *      for all devices on the device list to the removed notifier to remove
1567  *      the need for special case cleanup code.
1568  */
1569
1570 int unregister_netdevice_notifier(struct notifier_block *nb)
1571 {
1572         struct net_device *dev;
1573         struct net *net;
1574         int err;
1575
1576         rtnl_lock();
1577         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1578         if (err)
1579                 goto unlock;
1580
1581         for_each_net(net) {
1582                 for_each_netdev(net, dev) {
1583                         if (dev->flags & IFF_UP) {
1584                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1585                                                         dev);
1586                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1587                         }
1588                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1589                 }
1590         }
1591 unlock:
1592         rtnl_unlock();
1593         return err;
1594 }
1595 EXPORT_SYMBOL(unregister_netdevice_notifier);
1596
1597 /**
1598  *      call_netdevice_notifiers_info - call all network notifier blocks
1599  *      @val: value passed unmodified to notifier function
1600  *      @dev: net_device pointer passed unmodified to notifier function
1601  *      @info: notifier information data
1602  *
1603  *      Call all network notifier blocks.  Parameters and return value
1604  *      are as for raw_notifier_call_chain().
1605  */
1606
1607 static int call_netdevice_notifiers_info(unsigned long val,
1608                                          struct net_device *dev,
1609                                          struct netdev_notifier_info *info)
1610 {
1611         ASSERT_RTNL();
1612         netdev_notifier_info_init(info, dev);
1613         return raw_notifier_call_chain(&netdev_chain, val, info);
1614 }
1615
1616 /**
1617  *      call_netdevice_notifiers - call all network notifier blocks
1618  *      @val: value passed unmodified to notifier function
1619  *      @dev: net_device pointer passed unmodified to notifier function
1620  *
1621  *      Call all network notifier blocks.  Parameters and return value
1622  *      are as for raw_notifier_call_chain().
1623  */
1624
1625 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1626 {
1627         struct netdev_notifier_info info;
1628
1629         return call_netdevice_notifiers_info(val, dev, &info);
1630 }
1631 EXPORT_SYMBOL(call_netdevice_notifiers);
1632
#ifdef CONFIG_NET_CLS_ACT
/* Static key counted up and down by the two helpers below. */
static struct static_key ingress_needed __read_mostly;

/* Take one count on the ingress_needed static key. */
void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one count on the ingress_needed static key. */
void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
1648
1649 static struct static_key netstamp_needed __read_mostly;
1650 #ifdef HAVE_JUMP_LABEL
1651 /* We are not allowed to call static_key_slow_dec() from irq context
1652  * If net_disable_timestamp() is called from irq context, defer the
1653  * static_key_slow_dec() calls.
1654  */
1655 static atomic_t netstamp_needed_deferred;
1656 #endif
1657
1658 void net_enable_timestamp(void)
1659 {
1660 #ifdef HAVE_JUMP_LABEL
1661         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1662
1663         if (deferred) {
1664                 while (--deferred)
1665                         static_key_slow_dec(&netstamp_needed);
1666                 return;
1667         }
1668 #endif
1669         static_key_slow_inc(&netstamp_needed);
1670 }
1671 EXPORT_SYMBOL(net_enable_timestamp);
1672
1673 void net_disable_timestamp(void)
1674 {
1675 #ifdef HAVE_JUMP_LABEL
1676         if (in_interrupt()) {
1677                 atomic_inc(&netstamp_needed_deferred);
1678                 return;
1679         }
1680 #endif
1681         static_key_slow_dec(&netstamp_needed);
1682 }
1683 EXPORT_SYMBOL(net_disable_timestamp);
1684
1685 static inline void net_timestamp_set(struct sk_buff *skb)
1686 {
1687         skb->tstamp.tv64 = 0;
1688         if (static_key_false(&netstamp_needed))
1689                 __net_timestamp(skb);
1690 }
1691
/*
 * Stamp SKB with __net_timestamp() when timestamping is enabled, COND
 * holds and SKB carries no timestamp yet.
 *
 * Wrapped in do { } while (0) so that the macro expands to a single
 * statement and cannot capture a following "else"; invoke it with a
 * trailing semicolon.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)

1698 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1699 {
1700         unsigned int len;
1701
1702         if (!(dev->flags & IFF_UP))
1703                 return false;
1704
1705         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1706         if (skb->len <= len)
1707                 return true;
1708
1709         /* if TSO is enabled, we don't care about the length as the packet
1710          * could be forwarded without being segmented before
1711          */
1712         if (skb_is_gso(skb))
1713                 return true;
1714
1715         return false;
1716 }
1717 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1718
1719 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1720 {
1721         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1722                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1723                         atomic_long_inc(&dev->rx_dropped);
1724                         kfree_skb(skb);
1725                         return NET_RX_DROP;
1726                 }
1727         }
1728
1729         if (unlikely(!is_skb_forwardable(dev, skb))) {
1730                 atomic_long_inc(&dev->rx_dropped);
1731                 kfree_skb(skb);
1732                 return NET_RX_DROP;
1733         }
1734
1735         skb_scrub_packet(skb, true);
1736         skb->priority = 0;
1737         skb->protocol = eth_type_trans(skb, dev);
1738         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1739
1740         return 0;
1741 }
1742 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1743
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * Typical use is injecting an skb from the start_xmit function of one
 * device into the receive queue of another device.
 *
 * Because the receiving device may live in another namespace, all
 * information in the skb that could impact namespace isolation has to
 * be cleared; __dev_forward_skb() takes care of that before the skb is
 * passed to netif_rx_internal().
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	if (err)
		return err;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1767
1768 static inline int deliver_skb(struct sk_buff *skb,
1769                               struct packet_type *pt_prev,
1770                               struct net_device *orig_dev)
1771 {
1772         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1773                 return -ENOMEM;
1774         atomic_inc(&skb->users);
1775         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1776 }
1777
1778 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1779                                           struct packet_type **pt,
1780                                           struct net_device *orig_dev,
1781                                           __be16 type,
1782                                           struct list_head *ptype_list)
1783 {
1784         struct packet_type *ptype, *pt_prev = *pt;
1785
1786         list_for_each_entry_rcu(ptype, ptype_list, list) {
1787                 if (ptype->type != type)
1788                         continue;
1789                 if (pt_prev)
1790                         deliver_skb(skb, pt_prev, orig_dev);
1791                 pt_prev = ptype;
1792         }
1793         *pt = pt_prev;
1794 }
1795
1796 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1797 {
1798         if (!ptype->af_packet_priv || !skb->sk)
1799                 return false;
1800
1801         if (ptype->id_match)
1802                 return ptype->id_match(ptype, skb->sk);
1803         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1804                 return true;
1805
1806         return false;
1807 }
1808
1809 /*
1810  *      Support routine. Sends outgoing frames to any network
1811  *      taps currently in use.
1812  */
1813
1814 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1815 {
1816         struct packet_type *ptype;
1817         struct sk_buff *skb2 = NULL;
1818         struct packet_type *pt_prev = NULL;
1819         struct list_head *ptype_list = &ptype_all;
1820
1821         rcu_read_lock();
1822 again:
1823         list_for_each_entry_rcu(ptype, ptype_list, list) {
1824                 /* Never send packets back to the socket
1825                  * they originated from - MvS (miquels@drinkel.ow.org)
1826                  */
1827                 if (skb_loop_sk(ptype, skb))
1828                         continue;
1829
1830                 if (pt_prev) {
1831                         deliver_skb(skb2, pt_prev, skb->dev);
1832                         pt_prev = ptype;
1833                         continue;
1834                 }
1835
1836                 /* need to clone skb, done only once */
1837                 skb2 = skb_clone(skb, GFP_ATOMIC);
1838                 if (!skb2)
1839                         goto out_unlock;
1840
1841                 net_timestamp_set(skb2);
1842
1843                 /* skb->nh should be correctly
1844                  * set by sender, so that the second statement is
1845                  * just protection against buggy protocols.
1846                  */
1847                 skb_reset_mac_header(skb2);
1848
1849                 if (skb_network_header(skb2) < skb2->data ||
1850                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1851                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1852                                              ntohs(skb2->protocol),
1853                                              dev->name);
1854                         skb_reset_network_header(skb2);
1855                 }
1856
1857                 skb2->transport_header = skb2->network_header;
1858                 skb2->pkt_type = PACKET_OUTGOING;
1859                 pt_prev = ptype;
1860         }
1861
1862         if (ptype_list == &ptype_all) {
1863                 ptype_list = &dev->ptype_all;
1864                 goto again;
1865         }
1866 out_unlock:
1867         if (pt_prev)
1868                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1869         rcu_read_unlock();
1870 }
1871
1872 /**
1873  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1874  * @dev: Network device
1875  * @txq: number of queues available
1876  *
1877  * If real_num_tx_queues is changed the tc mappings may no longer be
1878  * valid. To resolve this verify the tc mapping remains valid and if
1879  * not NULL the mapping. With no priorities mapping to this
1880  * offset/count pair it will no longer be used. In the worst case TC0
1881  * is invalid nothing can be done so disable priority mappings. If is
1882  * expected that drivers will fix this mapping if they can before
1883  * calling netif_set_real_num_tx_queues.
1884  */
1885 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1886 {
1887         int i;
1888         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1889
1890         /* If TC0 is invalidated disable TC mapping */
1891         if (tc->offset + tc->count > txq) {
1892                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1893                 dev->num_tc = 0;
1894                 return;
1895         }
1896
1897         /* Invalidated prio to tc mappings set to TC0 */
1898         for (i = 1; i < TC_BITMASK + 1; i++) {
1899                 int q = netdev_get_prio_tc_map(dev, i);
1900
1901                 tc = &dev->tc_to_txq[q];
1902                 if (tc->offset + tc->count > txq) {
1903                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1904                                 i, q);
1905                         netdev_set_prio_tc_map(dev, i, 0);
1906                 }
1907         }
1908 }
1909
1910 #ifdef CONFIG_XPS
/* Held by the XPS routines below while they read or modify XPS maps */
static DEFINE_MUTEX(xps_map_mutex);

/* Dereference an RCU-protected XPS pointer with xps_map_mutex held */
#define xmap_dereference(P)						\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1914
1915 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1916                                         int cpu, u16 index)
1917 {
1918         struct xps_map *map = NULL;
1919         int pos;
1920
1921         if (dev_maps)
1922                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1923
1924         for (pos = 0; map && pos < map->len; pos++) {
1925                 if (map->queues[pos] == index) {
1926                         if (map->len > 1) {
1927                                 map->queues[pos] = map->queues[--map->len];
1928                         } else {
1929                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1930                                 kfree_rcu(map, rcu);
1931                                 map = NULL;
1932                         }
1933                         break;
1934                 }
1935         }
1936
1937         return map;
1938 }
1939
1940 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1941 {
1942         struct xps_dev_maps *dev_maps;
1943         int cpu, i;
1944         bool active = false;
1945
1946         mutex_lock(&xps_map_mutex);
1947         dev_maps = xmap_dereference(dev->xps_maps);
1948
1949         if (!dev_maps)
1950                 goto out_no_maps;
1951
1952         for_each_possible_cpu(cpu) {
1953                 for (i = index; i < dev->num_tx_queues; i++) {
1954                         if (!remove_xps_queue(dev_maps, cpu, i))
1955                                 break;
1956                 }
1957                 if (i == dev->num_tx_queues)
1958                         active = true;
1959         }
1960
1961         if (!active) {
1962                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1963                 kfree_rcu(dev_maps, rcu);
1964         }
1965
1966         for (i = index; i < dev->num_tx_queues; i++)
1967                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1968                                              NUMA_NO_NODE);
1969
1970 out_no_maps:
1971         mutex_unlock(&xps_map_mutex);
1972 }
1973
1974 static struct xps_map *expand_xps_map(struct xps_map *map,
1975                                       int cpu, u16 index)
1976 {
1977         struct xps_map *new_map;
1978         int alloc_len = XPS_MIN_MAP_ALLOC;
1979         int i, pos;
1980
1981         for (pos = 0; map && pos < map->len; pos++) {
1982                 if (map->queues[pos] != index)
1983                         continue;
1984                 return map;
1985         }
1986
1987         /* Need to add queue to this CPU's existing map */
1988         if (map) {
1989                 if (pos < map->alloc_len)
1990                         return map;
1991
1992                 alloc_len = map->alloc_len * 2;
1993         }
1994
1995         /* Need to allocate new map to store queue on this CPU's map */
1996         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1997                                cpu_to_node(cpu));
1998         if (!new_map)
1999                 return NULL;
2000
2001         for (i = 0; i < pos; i++)
2002                 new_map->queues[i] = map->queues[i];
2003         new_map->alloc_len = alloc_len;
2004         new_map->len = pos;
2005
2006         return new_map;
2007 }
2008
2009 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2010                         u16 index)
2011 {
2012         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2013         struct xps_map *map, *new_map;
2014         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2015         int cpu, numa_node_id = -2;
2016         bool active = false;
2017
2018         mutex_lock(&xps_map_mutex);
2019
2020         dev_maps = xmap_dereference(dev->xps_maps);
2021
2022         /* allocate memory for queue storage */
2023         for_each_online_cpu(cpu) {
2024                 if (!cpumask_test_cpu(cpu, mask))
2025                         continue;
2026
2027                 if (!new_dev_maps)
2028                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2029                 if (!new_dev_maps) {
2030                         mutex_unlock(&xps_map_mutex);
2031                         return -ENOMEM;
2032                 }
2033
2034                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2035                                  NULL;
2036
2037                 map = expand_xps_map(map, cpu, index);
2038                 if (!map)
2039                         goto error;
2040
2041                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2042         }
2043
2044         if (!new_dev_maps)
2045                 goto out_no_new_maps;
2046
2047         for_each_possible_cpu(cpu) {
2048                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2049                         /* add queue to CPU maps */
2050                         int pos = 0;
2051
2052                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2053                         while ((pos < map->len) && (map->queues[pos] != index))
2054                                 pos++;
2055
2056                         if (pos == map->len)
2057                                 map->queues[map->len++] = index;
2058 #ifdef CONFIG_NUMA
2059                         if (numa_node_id == -2)
2060                                 numa_node_id = cpu_to_node(cpu);
2061                         else if (numa_node_id != cpu_to_node(cpu))
2062                                 numa_node_id = -1;
2063 #endif
2064                 } else if (dev_maps) {
2065                         /* fill in the new device map from the old device map */
2066                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2067                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2068                 }
2069
2070         }
2071
2072         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2073
2074         /* Cleanup old maps */
2075         if (dev_maps) {
2076                 for_each_possible_cpu(cpu) {
2077                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2078                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2079                         if (map && map != new_map)
2080                                 kfree_rcu(map, rcu);
2081                 }
2082
2083                 kfree_rcu(dev_maps, rcu);
2084         }
2085
2086         dev_maps = new_dev_maps;
2087         active = true;
2088
2089 out_no_new_maps:
2090         /* update Tx queue numa node */
2091         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2092                                      (numa_node_id >= 0) ? numa_node_id :
2093                                      NUMA_NO_NODE);
2094
2095         if (!dev_maps)
2096                 goto out_no_maps;
2097
2098         /* removes queue from unused CPUs */
2099         for_each_possible_cpu(cpu) {
2100                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2101                         continue;
2102
2103                 if (remove_xps_queue(dev_maps, cpu, index))
2104                         active = true;
2105         }
2106
2107         /* free map if not active */
2108         if (!active) {
2109                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2110                 kfree_rcu(dev_maps, rcu);
2111         }
2112
2113 out_no_maps:
2114         mutex_unlock(&xps_map_mutex);
2115
2116         return 0;
2117 error:
2118         /* remove any maps that we added */
2119         for_each_possible_cpu(cpu) {
2120                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2121                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2122                                  NULL;
2123                 if (new_map && new_map != map)
2124                         kfree(new_map);
2125         }
2126
2127         mutex_unlock(&xps_map_mutex);
2128
2129         kfree(new_dev_maps);
2130         return -ENOMEM;
2131 }
2132 EXPORT_SYMBOL(netif_set_xps_queue);
2133
2134 #endif
2135 /*
2136  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2137  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2138  */
2139 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2140 {
2141         int rc;
2142
2143         if (txq < 1 || txq > dev->num_tx_queues)
2144                 return -EINVAL;
2145
2146         if (dev->reg_state == NETREG_REGISTERED ||
2147             dev->reg_state == NETREG_UNREGISTERING) {
2148                 ASSERT_RTNL();
2149
2150                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2151                                                   txq);
2152                 if (rc)
2153                         return rc;
2154
2155                 if (dev->num_tc)
2156                         netif_setup_tc(dev, txq);
2157
2158                 if (txq < dev->real_num_tx_queues) {
2159                         qdisc_reset_all_tx_gt(dev, txq);
2160 #ifdef CONFIG_XPS
2161                         netif_reset_xps_queues_gt(dev, txq);
2162 #endif
2163                 }
2164         }
2165
2166         dev->real_num_tx_queues = txq;
2167         return 0;
2168 }
2169 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2170
2171 #ifdef CONFIG_SYSFS
2172 /**
2173  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2174  *      @dev: Network device
2175  *      @rxq: Actual number of RX queues
2176  *
2177  *      This must be called either with the rtnl_lock held or before
2178  *      registration of the net device.  Returns 0 on success, or a
2179  *      negative error code.  If called before registration, it always
2180  *      succeeds.
2181  */
2182 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2183 {
2184         int rc;
2185
2186         if (rxq < 1 || rxq > dev->num_rx_queues)
2187                 return -EINVAL;
2188
2189         if (dev->reg_state == NETREG_REGISTERED) {
2190                 ASSERT_RTNL();
2191
2192                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2193                                                   rxq);
2194                 if (rc)
2195                         return rc;
2196         }
2197
2198         dev->real_num_rx_queues = rxq;
2199         return 0;
2200 }
2201 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2202 #endif
2203
2204 /**
2205  * netif_get_num_default_rss_queues - default number of RSS queues
2206  *
2207  * This routine should set an upper limit on the number of RSS queues
2208  * used by default by multiqueue devices.
2209  */
2210 int netif_get_num_default_rss_queues(void)
2211 {
2212         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2213 }
2214 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2215
2216 static inline void __netif_reschedule(struct Qdisc *q)
2217 {
2218         struct softnet_data *sd;
2219         unsigned long flags;
2220
2221         local_irq_save(flags);
2222         sd = this_cpu_ptr(&softnet_data);
2223         q->next_sched = NULL;
2224         *sd->output_queue_tailp = q;
2225         sd->output_queue_tailp = &q->next_sched;
2226         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2227         local_irq_restore(flags);
2228 }
2229
2230 void __netif_schedule(struct Qdisc *q)
2231 {
2232         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2233                 __netif_reschedule(q);
2234 }
2235 EXPORT_SYMBOL(__netif_schedule);
2236
2237 struct dev_kfree_skb_cb {
2238         enum skb_free_reason reason;
2239 };
2240
2241 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2242 {
2243         return (struct dev_kfree_skb_cb *)skb->cb;
2244 }
2245
2246 void netif_schedule_queue(struct netdev_queue *txq)
2247 {
2248         rcu_read_lock();
2249         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2250                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2251
2252                 __netif_schedule(q);
2253         }
2254         rcu_read_unlock();
2255 }
2256 EXPORT_SYMBOL(netif_schedule_queue);
2257
2258 /**
2259  *      netif_wake_subqueue - allow sending packets on subqueue
2260  *      @dev: network device
2261  *      @queue_index: sub queue index
2262  *
2263  * Resume individual transmit queue of a device with multiple transmit queues.
2264  */
2265 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2266 {
2267         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2268
2269         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2270                 struct Qdisc *q;
2271
2272                 rcu_read_lock();
2273                 q = rcu_dereference(txq->qdisc);
2274                 __netif_schedule(q);
2275                 rcu_read_unlock();
2276         }
2277 }
2278 EXPORT_SYMBOL(netif_wake_subqueue);
2279
2280 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2281 {
2282         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2283                 struct Qdisc *q;
2284
2285                 rcu_read_lock();
2286                 q = rcu_dereference(dev_queue->qdisc);
2287                 __netif_schedule(q);
2288                 rcu_read_unlock();
2289         }
2290 }
2291 EXPORT_SYMBOL(netif_tx_wake_queue);
2292
2293 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2294 {
2295         unsigned long flags;
2296
2297         if (likely(atomic_read(&skb->users) == 1)) {
2298                 smp_rmb();
2299                 atomic_set(&skb->users, 0);
2300         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2301                 return;
2302         }
2303         get_kfree_skb_cb(skb)->reason = reason;
2304         local_irq_save(flags);
2305         skb->next = __this_cpu_read(softnet_data.completion_queue);
2306         __this_cpu_write(softnet_data.completion_queue, skb);
2307         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2308         local_irq_restore(flags);
2309 }
2310 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2311
2312 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2313 {
2314         if (in_irq() || irqs_disabled())
2315                 __dev_kfree_skb_irq(skb, reason);
2316         else
2317                 dev_kfree_skb(skb);
2318 }
2319 EXPORT_SYMBOL(__dev_kfree_skb_any);
2320
2321
2322 /**
2323  * netif_device_detach - mark device as removed
2324  * @dev: network device
2325  *
2326  * Mark device as removed from system and therefore no longer available.
2327  */
2328 void netif_device_detach(struct net_device *dev)
2329 {
2330         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2331             netif_running(dev)) {
2332                 netif_tx_stop_all_queues(dev);
2333         }
2334 }
2335 EXPORT_SYMBOL(netif_device_detach);
2336
2337 /**
2338  * netif_device_attach - mark device as attached
2339  * @dev: network device
2340  *
2341  * Mark device as attached from system and restart if needed.
2342  */
2343 void netif_device_attach(struct net_device *dev)
2344 {
2345         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2346             netif_running(dev)) {
2347                 netif_tx_wake_all_queues(dev);
2348                 __netdev_watchdog_up(dev);
2349         }
2350 }
2351 EXPORT_SYMBOL(netif_device_attach);
2352
2353 static void skb_warn_bad_offload(const struct sk_buff *skb)
2354 {
2355         static const netdev_features_t null_features = 0;
2356         struct net_device *dev = skb->dev;
2357         const char *driver = "";
2358
2359         if (!net_ratelimit())
2360                 return;
2361
2362         if (dev && dev->dev.parent)
2363                 driver = dev_driver_string(dev->dev.parent);
2364
2365         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2366              "gso_type=%d ip_summed=%d\n",
2367              driver, dev ? &dev->features : &null_features,
2368              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2369              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2370              skb_shinfo(skb)->gso_type, skb->ip_summed);
2371 }
2372
2373 /*
2374  * Invalidate hardware checksum when packet is to be mangled, and
2375  * complete checksum manually on outgoing path.
2376  */
2377 int skb_checksum_help(struct sk_buff *skb)
2378 {
2379         __wsum csum;
2380         int ret = 0, offset;
2381
2382         if (skb->ip_summed == CHECKSUM_COMPLETE)
2383                 goto out_set_summed;
2384
2385         if (unlikely(skb_shinfo(skb)->gso_size)) {
2386                 skb_warn_bad_offload(skb);
2387                 return -EINVAL;
2388         }
2389
2390         /* Before computing a checksum, we should make sure no frag could
2391          * be modified by an external entity : checksum could be wrong.
2392          */
2393         if (skb_has_shared_frag(skb)) {
2394                 ret = __skb_linearize(skb);
2395                 if (ret)
2396                         goto out;
2397         }
2398
2399         offset = skb_checksum_start_offset(skb);
2400         BUG_ON(offset >= skb_headlen(skb));
2401         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2402
2403         offset += skb->csum_offset;
2404         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2405
2406         if (skb_cloned(skb) &&
2407             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2408                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2409                 if (ret)
2410                         goto out;
2411         }
2412
2413         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2414 out_set_summed:
2415         skb->ip_summed = CHECKSUM_NONE;
2416 out:
2417         return ret;
2418 }
2419 EXPORT_SYMBOL(skb_checksum_help);
2420
2421 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2422 {
2423         __be16 type = skb->protocol;
2424
2425         /* Tunnel gso handlers can set protocol to ethernet. */
2426         if (type == htons(ETH_P_TEB)) {
2427                 struct ethhdr *eth;
2428
2429                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2430                         return 0;
2431
2432                 eth = (struct ethhdr *)skb_mac_header(skb);
2433                 type = eth->h_proto;
2434         }
2435
2436         return __vlan_get_protocol(skb, type, depth);
2437 }
2438
2439 /**
2440  *      skb_mac_gso_segment - mac layer segmentation handler.
2441  *      @skb: buffer to segment
2442  *      @features: features for the output path (see dev->features)
2443  */
2444 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2445                                     netdev_features_t features)
2446 {
2447         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2448         struct packet_offload *ptype;
2449         int vlan_depth = skb->mac_len;
2450         __be16 type = skb_network_protocol(skb, &vlan_depth);
2451
2452         if (unlikely(!type))
2453                 return ERR_PTR(-EINVAL);
2454
2455         __skb_pull(skb, vlan_depth);
2456
2457         rcu_read_lock();
2458         list_for_each_entry_rcu(ptype, &offload_base, list) {
2459                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2460                         segs = ptype->callbacks.gso_segment(skb, features);
2461                         break;
2462                 }
2463         }
2464         rcu_read_unlock();
2465
2466         __skb_push(skb, skb->data - skb_mac_header(skb));
2467
2468         return segs;
2469 }
2470 EXPORT_SYMBOL(skb_mac_gso_segment);
2471
2472
2473 /* openvswitch calls this on rx path, so we need a different check.
2474  */
2475 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2476 {
2477         if (tx_path)
2478                 return skb->ip_summed != CHECKSUM_PARTIAL;
2479         else
2480                 return skb->ip_summed == CHECKSUM_NONE;
2481 }
2482
2483 /**
2484  *      __skb_gso_segment - Perform segmentation on skb.
2485  *      @skb: buffer to segment
2486  *      @features: features for the output path (see dev->features)
2487  *      @tx_path: whether it is called in TX path
2488  *
2489  *      This function segments the given skb and returns a list of segments.
2490  *
2491  *      It may return NULL if the skb requires no segmentation.  This is
2492  *      only possible when GSO is used for verifying header integrity.
2493  */
2494 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2495                                   netdev_features_t features, bool tx_path)
2496 {
2497         if (unlikely(skb_needs_check(skb, tx_path))) {
2498                 int err;
2499
2500                 skb_warn_bad_offload(skb);
2501
2502                 err = skb_cow_head(skb, 0);
2503                 if (err < 0)
2504                         return ERR_PTR(err);
2505         }
2506
2507         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2508         SKB_GSO_CB(skb)->encap_level = 0;
2509
2510         skb_reset_mac_header(skb);
2511         skb_reset_mac_len(skb);
2512
2513         return skb_mac_gso_segment(skb, features);
2514 }
2515 EXPORT_SYMBOL(__skb_gso_segment);
2516
/* Take action when hardware reception checksum errors are detected:
 * log the device name (rate limited) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2528
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if @skb has a page fragment that @dev cannot DMA from:
 * a highmem page while the device lacks NETIF_F_HIGHDMA, or (when the
 * PCI DMA bus is physical) a page beyond the parent device's DMA mask.
 * Always returns 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *pdev;
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	pdev = dev->dev.parent;
	if (!pdev)
		return 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2561
2562 /* If MPLS offload request, verify we are testing hardware MPLS features
2563  * instead of standard features for the netdev.
2564  */
2565 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2566 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2567                                            netdev_features_t features,
2568                                            __be16 type)
2569 {
2570         if (eth_p_mpls(type))
2571                 features &= skb->dev->mpls_features;
2572
2573         return features;
2574 }
2575 #else
2576 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2577                                            netdev_features_t features,
2578                                            __be16 type)
2579 {
2580         return features;
2581 }
2582 #endif
2583
2584 static netdev_features_t harmonize_features(struct sk_buff *skb,
2585         netdev_features_t features)
2586 {
2587         int tmp;
2588         __be16 type;
2589
2590         type = skb_network_protocol(skb, &tmp);
2591         features = net_mpls_features(skb, features, type);
2592
2593         if (skb->ip_summed != CHECKSUM_NONE &&
2594             !can_checksum_protocol(features, type)) {
2595                 features &= ~NETIF_F_ALL_CSUM;
2596         } else if (illegal_highdma(skb->dev, skb)) {
2597                 features &= ~NETIF_F_SG;
2598         }
2599
2600         return features;
2601 }
2602
/* Feature-check callback that accepts everything: it returns @features
 * unchanged and looks at neither @skb nor @dev.  Exported for use
 * outside this file.
 */
2603 netdev_features_t passthru_features_check(struct sk_buff *skb,
2604                                           struct net_device *dev,
2605                                           netdev_features_t features)
2606 {
2607         return features;
2608 }
2609 EXPORT_SYMBOL(passthru_features_check);
2610
/* Default feature check, used by netif_skb_features() when the driver
 * supplies no ndo_features_check hook.  It returns the result of
 * vlan_features_check() for @skb and @features; @dev is not consulted.
 */
2611 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2612                                              struct net_device *dev,
2613                                              netdev_features_t features)
2614 {
2615         return vlan_features_check(skb, features);
2616 }
2617
2618 netdev_features_t netif_skb_features(struct sk_buff *skb)
2619 {
2620         struct net_device *dev = skb->dev;
2621         netdev_features_t features = dev->features;
2622         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2623
2624         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2625                 features &= ~NETIF_F_GSO_MASK;
2626
2627         /* If encapsulation offload request, verify we are testing
2628          * hardware encapsulation features instead of standard
2629          * features for the netdev
2630          */
2631         if (skb->encapsulation)
2632                 features &= dev->hw_enc_features;
2633
2634         if (skb_vlan_tagged(skb))
2635                 features = netdev_intersect_features(features,
2636                                                      dev->vlan_features |
2637                                                      NETIF_F_HW_VLAN_CTAG_TX |
2638                                                      NETIF_F_HW_VLAN_STAG_TX);
2639
2640         if (dev->netdev_ops->ndo_features_check)
2641                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2642                                                                 features);
2643         else
2644                 features &= dflt_features_check(skb, dev, features);
2645
2646         return harmonize_features(skb, features);
2647 }
2648 EXPORT_SYMBOL(netif_skb_features);
2649
2650 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2651                     struct netdev_queue *txq, bool more)
2652 {
2653         unsigned int len;
2654         int rc;
2655
2656         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2657                 dev_queue_xmit_nit(skb, dev);
2658
2659         len = skb->len;
2660         trace_net_dev_start_xmit(skb, dev);
2661         rc = netdev_start_xmit(skb, dev, txq, more);
2662         trace_net_dev_xmit(skb, rc, dev, len);
2663
2664         return rc;
2665 }
2666
2667 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2668                                     struct netdev_queue *txq, int *ret)
2669 {
2670         struct sk_buff *skb = first;
2671         int rc = NETDEV_TX_OK;
2672
2673         while (skb) {
2674                 struct sk_buff *next = skb->next;
2675
2676                 skb->next = NULL;
2677                 rc = xmit_one(skb, dev, txq, next != NULL);
2678                 if (unlikely(!dev_xmit_complete(rc))) {
2679                         skb->next = next;
2680                         goto out;
2681                 }
2682
2683                 skb = next;
2684                 if (netif_xmit_stopped(txq) && skb) {
2685                         rc = NETDEV_TX_BUSY;
2686                         break;
2687                 }
2688         }
2689
2690 out:
2691         *ret = rc;
2692         return skb;
2693 }
2694
2695 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2696                                           netdev_features_t features)
2697 {
2698         if (skb_vlan_tag_present(skb) &&
2699             !vlan_hw_offload_capable(features, skb->vlan_proto))
2700                 skb = __vlan_hwaccel_push_inside(skb);
2701         return skb;
2702 }
2703
2704 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2705 {
2706         netdev_features_t features;
2707
2708         if (skb->next)
2709                 return skb;
2710
2711         features = netif_skb_features(skb);
2712         skb = validate_xmit_vlan(skb, features);
2713         if (unlikely(!skb))
2714                 goto out_null;
2715
2716         if (netif_needs_gso(skb, features)) {
2717                 struct sk_buff *segs;
2718
2719                 segs = skb_gso_segment(skb, features);
2720                 if (IS_ERR(segs)) {
2721                         goto out_kfree_skb;
2722                 } else if (segs) {
2723                         consume_skb(skb);
2724                         skb = segs;
2725                 }
2726         } else {
2727                 if (skb_needs_linearize(skb, features) &&
2728                     __skb_linearize(skb))
2729                         goto out_kfree_skb;
2730
2731                 /* If packet is not checksummed and device does not
2732                  * support checksumming for this protocol, complete
2733                  * checksumming here.
2734                  */
2735                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2736                         if (skb->encapsulation)
2737                                 skb_set_inner_transport_header(skb,
2738                                                                skb_checksum_start_offset(skb));
2739                         else
2740                                 skb_set_transport_header(skb,
2741                                                          skb_checksum_start_offset(skb));
2742                         if (!(features & NETIF_F_ALL_CSUM) &&
2743                             skb_checksum_help(skb))
2744                                 goto out_kfree_skb;
2745                 }
2746         }
2747
2748         return skb;
2749
2750 out_kfree_skb:
2751         kfree_skb(skb);
2752 out_null:
2753         return NULL;
2754 }
2755
2756 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2757 {
2758         struct sk_buff *next, *head = NULL, *tail;
2759
2760         for (; skb != NULL; skb = next) {
2761                 next = skb->next;
2762                 skb->next = NULL;
2763
2764                 /* in case skb wont be segmented, point to itself */
2765                 skb->prev = skb;
2766
2767                 skb = validate_xmit_skb(skb, dev);
2768                 if (!skb)
2769                         continue;
2770
2771                 if (!head)
2772                         head = skb;
2773                 else
2774                         tail->next = skb;
2775                 /* If skb was segmented, skb->prev points to
2776                  * the last segment. If not, it still contains skb.
2777                  */
2778                 tail = skb->prev;
2779         }
2780         return head;
2781 }
2782
2783 static void qdisc_pkt_len_init(struct sk_buff *skb)
2784 {
2785         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2786
2787         qdisc_skb_cb(skb)->pkt_len = skb->len;
2788
2789         /* To get more precise estimation of bytes sent on wire,
2790          * we add to pkt_len the headers size of all segments
2791          */
2792         if (shinfo->gso_size)  {
2793                 unsigned int hdr_len;
2794                 u16 gso_segs = shinfo->gso_segs;
2795
2796                 /* mac layer + network layer */
2797                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2798
2799                 /* + transport layer */
2800                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2801                         hdr_len += tcp_hdrlen(skb);
2802                 else
2803                         hdr_len += sizeof(struct udphdr);
2804
2805                 if (shinfo->gso_type & SKB_GSO_DODGY)
2806                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2807                                                 shinfo->gso_size);
2808
2809                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2810         }
2811 }
2812
/* Send @skb through qdisc @q of transmit queue @txq on @dev.
 *
 * Under the qdisc root lock one of three things happens:
 *  - the qdisc is deactivated: the skb is freed, NET_XMIT_DROP is returned;
 *  - the qdisc allows bypass (TCQ_F_CAN_BYPASS), is empty and not running:
 *    the skb is transmitted directly via sch_direct_xmit() and
 *    NET_XMIT_SUCCESS is returned;
 *  - otherwise the skb is enqueued and the qdisc is run if nobody else is
 *    running it; the return value is q->enqueue()'s result masked with
 *    NET_XMIT_MASK.
 */
2813 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2814                                  struct net_device *dev,
2815                                  struct netdev_queue *txq)
2816 {
2817         spinlock_t *root_lock = qdisc_lock(q);
2818         bool contended;
2819         int rc;
2820
2821         qdisc_pkt_len_init(skb);
2822         qdisc_calculate_pkt_len(skb, q);
2823         /*
2824          * Heuristic to force contended enqueues to serialize on a
2825          * separate lock before trying to get qdisc main lock.
2826          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2827          * often and dequeue packets faster.
2828          */
2829         contended = qdisc_is_running(q);
2830         if (unlikely(contended))
2831                 spin_lock(&q->busylock);
2832
2833         spin_lock(root_lock);
2834         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2835                 kfree_skb(skb);
2836                 rc = NET_XMIT_DROP;
2837         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2838                    qdisc_run_begin(q)) {
2839                 /*
2840                  * This is a work-conserving queue; there are no old skbs
2841                  * waiting to be sent out; and the qdisc is not running -
2842                  * xmit the skb directly.
2843                  */
2844
2845                 qdisc_bstats_update(q, skb);
2846
2847                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
                      /* busylock is dropped before running the qdisc;
                       * clearing 'contended' prevents a second unlock
                       * at the end of the function.
                       */
2848                         if (unlikely(contended)) {
2849                                 spin_unlock(&q->busylock);
2850                                 contended = false;
2851                         }
2852                         __qdisc_run(q);
2853                 } else
2854                         qdisc_run_end(q);
2855
2856                 rc = NET_XMIT_SUCCESS;
2857         } else {
2858                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2859                 if (qdisc_run_begin(q)) {
2860                         if (unlikely(contended)) {
2861                                 spin_unlock(&q->busylock);
2862                                 contended = false;
2863                         }
2864                         __qdisc_run(q);
2865                 }
2866         }
2867         spin_unlock(root_lock);
             /* Still holding busylock only if no branch above released it */
2868         if (unlikely(contended))
2869                 spin_unlock(&q->busylock);
2870         return rc;
2871 }
2872
2873 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2874 static void skb_update_prio(struct sk_buff *skb)
2875 {
2876         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2877
2878         if (!skb->priority && skb->sk && map) {
2879                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2880
2881                 if (prioidx < map->priomap_len)
2882                         skb->priority = map->priomap[prioidx];
2883         }
2884 }
2885 #else
2886 #define skb_update_prio(skb)
2887 #endif
2888
/* Per-CPU counter that __dev_queue_xmit() increments around
 * dev_hard_start_xmit() on the queueless path; it is exported for use
 * outside this file.
 */
2889 DEFINE_PER_CPU(int, xmit_recursion);
2890 EXPORT_SYMBOL(xmit_recursion);
2891
/* __dev_queue_xmit() refuses to transmit once xmit_recursion exceeds this */
2892 #define RECURSION_LIMIT 10
2893
2894 /**
2895  *      dev_loopback_xmit - loop back @skb
 *      @sk: socket argument; not referenced by this function
2896  *      @skb: buffer to transmit
 *
 *      Feeds @skb back into the receive path through netif_rx_ni() after
 *      marking it PACKET_LOOPBACK with CHECKSUM_UNNECESSARY.  Always
 *      returns 0.
2897  */
2898 int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
2899 {
2900         skb_reset_mac_header(skb);
             /* Strip everything in front of the network header */
2901         __skb_pull(skb, skb_network_offset(skb));
2902         skb->pkt_type = PACKET_LOOPBACK;
2903         skb->ip_summed = CHECKSUM_UNNECESSARY;
             /* A looped-back skb is expected to carry a dst; warn if not */
2904         WARN_ON(!skb_dst(skb));
2905         skb_dst_force(skb);
2906         netif_rx_ni(skb);
2907         return 0;
2908 }
2909 EXPORT_SYMBOL(dev_loopback_xmit);
2910
2911 /**
2912  *      __dev_queue_xmit - transmit a buffer
2913  *      @skb: buffer to transmit
2914  *      @accel_priv: private data used for L2 forwarding offload
2915  *
2916  *      Queue a buffer for transmission to a network device. The caller must
2917  *      have set the device and priority and built the buffer before calling
2918  *      this function. The function can be called from an interrupt.
2919  *
2920  *      A negative errno code is returned on a failure. A success does not
2921  *      guarantee the frame will be transmitted as it may be dropped due
2922  *      to congestion or traffic shaping.
2923  *
2924  * -----------------------------------------------------------------------------------
2925  *      I notice this method can also return errors from the queue disciplines,
2926  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2927  *      be positive.
2928  *
2929  *      Regardless of the return value, the skb is consumed, so it is currently
2930  *      difficult to retry a send to this method.  (You can bump the ref count
2931  *      before sending to hold a reference for retry if you are careful.)
2932  *
2933  *      When calling this method, interrupts MUST be enabled.  This is because
2934  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2935  *          --BLG
2936  */
2937 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2938 {
2939         struct net_device *dev = skb->dev;
2940         struct netdev_queue *txq;
2941         struct Qdisc *q;
             /* -ENOMEM is what gets returned when validate_xmit_skb()
              * yields NULL on the queueless path (goto drop).
              */
2942         int rc = -ENOMEM;
2943
2944         skb_reset_mac_header(skb);
2945
2946         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2947                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2948
2949         /* Disable soft irqs for various locks below. Also
2950          * stops preemption for RCU.
2951          */
2952         rcu_read_lock_bh();
2953
2954         skb_update_prio(skb);
2955
2956         /* If device/qdisc don't need skb->dst, release it right now while
2957          * it's hot in this cpu cache.
2958          */
2959         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2960                 skb_dst_drop(skb);
2961         else
2962                 skb_dst_force(skb);
2963
2964         txq = netdev_pick_tx(dev, skb, accel_priv);
2965         q = rcu_dereference_bh(txq->qdisc);
2966
2967 #ifdef CONFIG_NET_CLS_ACT
2968         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2969 #endif
2970         trace_net_dev_queue(skb);
             /* A qdisc with an enqueue hook takes the packet; everything
              * after this block is the queueless path.
              */
2971         if (q->enqueue) {
2972                 rc = __dev_xmit_skb(skb, q, dev, txq);
2973                 goto out;
2974         }
2975
2976         /* The device has no queue. Common case for software devices:
2977            loopback, all the sorts of tunnels...
2978
2979            Really, it is unlikely that netif_tx_lock protection is necessary
2980            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2981            counters.)
2982            However, it is possible, that they rely on protection
2983            made by us here.
2984
2985            Check this and shot the lock. It is not prone from deadlocks.
2986            Either shot noqueue qdisc, it is even simpler 8)
2987          */
2988         if (dev->flags & IFF_UP) {
2989                 int cpu = smp_processor_id(); /* ok because BHs are off */
2990
                     /* This CPU already owning the tx lock means we were
                      * re-entered from within a transmit on the same queue.
                      */
2991                 if (txq->xmit_lock_owner != cpu) {
2992
2993                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2994                                 goto recursion_alert;
2995
2996                         skb = validate_xmit_skb(skb, dev);
2997                         if (!skb)
2998                                 goto drop;
2999
3000                         HARD_TX_LOCK(dev, txq, cpu);
3001
3002                         if (!netif_xmit_stopped(txq)) {
3003                                 __this_cpu_inc(xmit_recursion);
3004                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3005                                 __this_cpu_dec(xmit_recursion);
3006                                 if (dev_xmit_complete(rc)) {
3007                                         HARD_TX_UNLOCK(dev, txq);
3008                                         goto out;
3009                                 }
3010                         }
3011                         HARD_TX_UNLOCK(dev, txq);
3012                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3013                                              dev->name);
3014                 } else {
3015                         /* Recursion is detected! It is possible,
3016                          * unfortunately
3017                          */
3018 recursion_alert:
3019                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3020                                              dev->name);
3021                 }
3022         }
3023
             /* Reached when the device is down, the queue was stopped, the
              * driver did not complete the transmit, or recursion was hit.
              */
3024         rc = -ENETDOWN;
3025 drop:
3026         rcu_read_unlock_bh();
3027
3028         atomic_long_inc(&dev->tx_dropped);
3029         kfree_skb_list(skb);
3030         return rc;
3031 out:
3032         rcu_read_unlock_bh();
3033         return rc;
3034 }
3035
/* Exported transmit entry point: forwards @skb to __dev_queue_xmit()
 * with no accel_priv data.  @sk is not used here.  Returns whatever
 * __dev_queue_xmit() returns.
 */
3036 int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb)
3037 {
3038         return __dev_queue_xmit(skb, NULL);
3039 }
3040 EXPORT_SYMBOL(dev_queue_xmit_sk);
3041
/* Exported transmit entry point that passes @accel_priv through to
 * __dev_queue_xmit(), which hands it on to netdev_pick_tx().  Returns
 * whatever __dev_queue_xmit() returns.
 */
3042 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3043 {
3044         return __dev_queue_xmit(skb, accel_priv);
3045 }
3046 EXPORT_SYMBOL(dev_queue_xmit_accel);
3047
3048
3049 /*=======================================================================
3050                         Receiver routines
3051   =======================================================================*/
3052
/* Upper bound on a per-CPU input_pkt_queue: enqueue_to_backlog() drops
 * the packet once the queue is longer than this, and skb_flow_limit()
 * starts its accounting at half of it.
 */
3053 int netdev_max_backlog __read_mostly = 1000;
3054 EXPORT_SYMBOL(netdev_max_backlog);
3055
/* Passed as the condition to net_timestamp_check() in netif_rx_internal() */
3056 int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): netdev_budget and weight_p are not referenced in this part
 * of the file; presumably they are receive-poll limits -- confirm at users.
 */
3057 int netdev_budget __read_mostly = 300;
3058 int weight_p __read_mostly = 64;            /* old backlog weight */
3059
/* Append @napi to the tail of @sd's poll_list and raise NET_RX_SOFTIRQ
 * using the irq-off variant of the raise helper.
 */
3060 /* Called with irq disabled */
3061 static inline void ____napi_schedule(struct softnet_data *sd,
3062                                      struct napi_struct *napi)
3063 {
3064         list_add_tail(&napi->poll_list, &sd->poll_list);
3065         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3066 }
3067
3068 #ifdef CONFIG_RPS
3069
3070 /* One global table that all flow-based protocols share. */
/* RCU-protected pointer; get_rps_cpu() reads it via rcu_dereference(). */
3071 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3072 EXPORT_SYMBOL(rps_sock_flow_table);
/* Mask that get_rps_cpu() uses to split a sock-flow entry: the masked
 * bits give the CPU, the remaining bits are compared against the hash.
 */
3073 u32 rps_cpu_mask __read_mostly;
3074 EXPORT_SYMBOL(rps_cpu_mask);
3075
/* Static key tested by netif_rx_internal() to decide whether to take the
 * RPS steering path at all.
 */
3076 struct static_key rps_needed __read_mostly;
3077
/* Point a flow at @next_cpu and return the flow entry that is now in use.
 *
 * @rflow->cpu is always set to @next_cpu.  When @next_cpu is a valid CPU
 * number, last_qtail is also set to that CPU's current input_queue_head.
 *
 * With CONFIG_RFS_ACCEL, and when the skb has a recorded rx queue and the
 * device has a CPU rmap and NETIF_F_NTUPLE, the driver is asked through
 * ndo_rx_flow_steer() to steer the flow to the rx queue associated with
 * @next_cpu.  On success the returned entry is the one in that queue's
 * flow table instead of the @rflow passed in.
 */
3078 static struct rps_dev_flow *
3079 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3080             struct rps_dev_flow *rflow, u16 next_cpu)
3081 {
3082         if (next_cpu < nr_cpu_ids) {
3083 #ifdef CONFIG_RFS_ACCEL
3084                 struct netdev_rx_queue *rxqueue;
3085                 struct rps_dev_flow_table *flow_table;
3086                 struct rps_dev_flow *old_rflow;
3087                 u32 flow_id;
3088                 u16 rxq_index;
3089                 int rc;
3090
3091                 /* Should we steer this flow to a different hardware queue? */
3092                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3093                     !(dev->features & NETIF_F_NTUPLE))
3094                         goto out;
3095                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
                     /* Already arriving on the wanted queue: nothing to steer */
3096                 if (rxq_index == skb_get_rx_queue(skb))
3097                         goto out;
3098
3099                 rxqueue = dev->_rx + rxq_index;
3100                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3101                 if (!flow_table)
3102                         goto out;
3103                 flow_id = skb_get_hash(skb) & flow_table->mask;
3104                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3105                                                         rxq_index, flow_id);
3106                 if (rc < 0)
3107                         goto out;
                     /* A non-negative return value is stored as the filter
                      * id of the flow entry in the new queue's table.
                      */
3108                 old_rflow = rflow;
3109                 rflow = &flow_table->flows[flow_id];
3110                 rflow->filter = rc;
                     /* The old entry must not keep claiming the same filter */
3111                 if (old_rflow->filter == rflow->filter)
3112                         old_rflow->filter = RPS_NO_FILTER;
3113         out:
3114 #endif
3115                 rflow->last_qtail =
3116                         per_cpu(softnet_data, next_cpu).input_queue_head;
3117         }
3118
3119         rflow->cpu = next_cpu;
3120         return rflow;
3121 }
3122
3123 /*
3124  * get_rps_cpu is called from netif_receive_skb and returns the target
3125  * CPU from the RPS map of the receiving queue for a given skb.
3126  * rcu_read_lock must be held on entry.
 *
 * Returns -1 when no CPU is selected: invalid recorded rx queue, neither
 * a flow table nor an RPS map on the queue, a zero hash, or no online
 * candidate.  *rflowp is written only when the CPU comes from the flow
 * table path; on every other path it is left untouched.
3127  */
3128 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3129                        struct rps_dev_flow **rflowp)
3130 {
3131         const struct rps_sock_flow_table *sock_flow_table;
3132         struct netdev_rx_queue *rxqueue = dev->_rx;
3133         struct rps_dev_flow_table *flow_table;
3134         struct rps_map *map;
3135         int cpu = -1;
3136         u32 tcpu;
3137         u32 hash;
3138
3139         if (skb_rx_queue_recorded(skb)) {
3140                 u16 index = skb_get_rx_queue(skb);
3141
3142                 if (unlikely(index >= dev->real_num_rx_queues)) {
3143                         WARN_ONCE(dev->real_num_rx_queues > 1,
3144                                   "%s received packet on queue %u, but number "
3145                                   "of RX queues is %u\n",
3146                                   dev->name, index, dev->real_num_rx_queues);
3147                         goto done;
3148                 }
3149                 rxqueue += index;
3150         }
3151
3152         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3153
3154         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3155         map = rcu_dereference(rxqueue->rps_map);
3156         if (!flow_table && !map)
3157                 goto done;
3158
3159         skb_reset_network_header(skb);
3160         hash = skb_get_hash(skb);
3161         if (!hash)
3162                 goto done;
3163
3164         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3165         if (flow_table && sock_flow_table) {
3166                 struct rps_dev_flow *rflow;
3167                 u32 next_cpu;
3168                 u32 ident;
3169
3170                 /* First check into global flow table if there is a match */
3171                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
                     /* Bits outside rps_cpu_mask must equal the hash's bits,
                      * otherwise the entry belongs to a different flow.
                      */
3172                 if ((ident ^ hash) & ~rps_cpu_mask)
3173                         goto try_rps;
3174
3175                 next_cpu = ident & rps_cpu_mask;
3176
3177                 /* OK, now we know there is a match,
3178                  * we can look at the local (per receive queue) flow table
3179                  */
3180                 rflow = &flow_table->flows[hash & flow_table->mask];
3181                 tcpu = rflow->cpu;
3182
3183                 /*
3184                  * If the desired CPU (where last recvmsg was done) is
3185                  * different from current CPU (one in the rx-queue flow
3186                  * table entry), switch if one of the following holds:
3187                  *   - Current CPU is unset (>= nr_cpu_ids).
3188                  *   - Current CPU is offline.
3189                  *   - The current CPU's queue tail has advanced beyond the
3190                  *     last packet that was enqueued using this table entry.
3191                  *     This guarantees that all previous packets for the flow
3192                  *     have been dequeued, thus preserving in order delivery.
3193                  */
3194                 if (unlikely(tcpu != next_cpu) &&
3195                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3196                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3197                       rflow->last_qtail)) >= 0)) {
3198                         tcpu = next_cpu;
3199                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3200                 }
3201
3202                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3203                         *rflowp = rflow;
3204                         cpu = tcpu;
3205                         goto done;
3206                 }
3207         }
3208
3209 try_rps:
3210
             /* Fallback: pick a CPU from the queue's RPS map by hash */
3211         if (map) {
3212                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3213                 if (cpu_online(tcpu)) {
3214                         cpu = tcpu;
3215                         goto done;
3216                 }
3217         }
3218
3219 done:
3220         return cpu;
3221 }
3222
3223 #ifdef CONFIG_RFS_ACCEL
3224
3225 /**
3226  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3227  * @dev: Device on which the filter was set
3228  * @rxq_index: RX queue index
3229  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3230  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3231  *
3232  * Drivers that implement ndo_rx_flow_steer() should periodically call
3233  * this function for each installed filter and remove the filters for
3234  * which it returns %true.
3235  */
3236 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3237                          u32 flow_id, u16 filter_id)
3238 {
3239         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3240         struct rps_dev_flow_table *flow_table;
3241         struct rps_dev_flow *rflow;
3242         bool expire = true;
3243         unsigned int cpu;
3244
3245         rcu_read_lock();
3246         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3247         if (flow_table && flow_id <= flow_table->mask) {
3248                 rflow = &flow_table->flows[flow_id];
3249                 cpu = ACCESS_ONCE(rflow->cpu);
3250                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3251                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3252                            rflow->last_qtail) <
3253                      (int)(10 * flow_table->mask)))
3254                         expire = false;
3255         }
3256         rcu_read_unlock();
3257         return expire;
3258 }
3259 EXPORT_SYMBOL(rps_may_expire_flow);
3260
3261 #endif /* CONFIG_RFS_ACCEL */
3262
/* @data is the struct softnet_data whose backlog NAPI is to be scheduled.
 * Schedules sd->backlog on that softnet_data and counts the event in
 * sd->received_rps.
 */
3263 /* Called from hardirq (IPI) context */
3264 static void rps_trigger_softirq(void *data)
3265 {
3266         struct softnet_data *sd = data;
3267
3268         ____napi_schedule(sd, &sd->backlog);
3269         sd->received_rps++;
3270 }
3271
3272 #endif /* CONFIG_RPS */
3273
/*
 * Decide whether @sd belongs to a different CPU than the one we run on.
 * If it does, push it onto this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ
 * and return 1.  If it is our own softnet_data (or RPS is not built in),
 * return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = this_cpu_ptr(&softnet_data);

	if (sd == local)
		return 0;

	/* Link @sd at the head of the local IPI list */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3294
/* Only defined with CONFIG_NET_FLOW_LIMIT; initial value is 1 << 12 (4096).
 * NOTE(review): presumably the bucket count used when a flow-limit table
 * is allocated -- not referenced in this part of the file, confirm at users.
 */
3295 #ifdef CONFIG_NET_FLOW_LIMIT
3296 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3297 #endif
3298
3299 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3300 {
3301 #ifdef CONFIG_NET_FLOW_LIMIT
3302         struct sd_flow_limit *fl;
3303         struct softnet_data *sd;
3304         unsigned int old_flow, new_flow;
3305
3306         if (qlen < (netdev_max_backlog >> 1))
3307                 return false;
3308
3309         sd = this_cpu_ptr(&softnet_data);
3310
3311         rcu_read_lock();
3312         fl = rcu_dereference(sd->flow_limit);
3313         if (fl) {
3314                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3315                 old_flow = fl->history[fl->history_head];
3316                 fl->history[fl->history_head] = new_flow;
3317
3318                 fl->history_head++;
3319                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3320
3321                 if (likely(fl->buckets[old_flow]))
3322                         fl->buckets[old_flow]--;
3323
3324                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3325                         fl->count++;
3326                         rcu_read_unlock();
3327                         return true;
3328                 }
3329         }
3330         rcu_read_unlock();
3331 #endif
3332         return false;
3333 }
3334
3335 /*
3336  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3337  * queue (may be a remote CPU queue).
3338  */
3339 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3340                               unsigned int *qtail)
3341 {
3342         struct softnet_data *sd;
3343         unsigned long flags;
3344         unsigned int qlen;
3345
3346         sd = &per_cpu(softnet_data, cpu);
3347
3348         local_irq_save(flags);
3349
3350         rps_lock(sd);
3351         qlen = skb_queue_len(&sd->input_pkt_queue);
3352         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3353                 if (qlen) {
3354 enqueue:
3355                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3356                         input_queue_tail_incr_save(sd, qtail);
3357                         rps_unlock(sd);
3358                         local_irq_restore(flags);
3359                         return NET_RX_SUCCESS;
3360                 }
3361
3362                 /* Schedule NAPI for backlog device
3363                  * We can use non atomic operation since we own the queue lock
3364                  */
3365                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3366                         if (!rps_ipi_queued(sd))
3367                                 ____napi_schedule(sd, &sd->backlog);
3368                 }
3369                 goto enqueue;
3370         }
3371
3372         sd->dropped++;
3373         rps_unlock(sd);
3374
3375         local_irq_restore(flags);
3376
3377         atomic_long_inc(&skb->dev->rx_dropped);
3378         kfree_skb(skb);
3379         return NET_RX_DROP;
3380 }
3381
3382 static int netif_rx_internal(struct sk_buff *skb)
3383 {
3384         int ret;
3385
3386         net_timestamp_check(netdev_tstamp_prequeue, skb);
3387
3388         trace_netif_rx(skb);
3389 #ifdef CONFIG_RPS
3390         if (static_key_false(&rps_needed)) {
3391                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3392                 int cpu;
3393
3394                 preempt_disable();
3395                 rcu_read_lock();
3396
3397                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3398                 if (cpu < 0)
3399                         cpu = smp_processor_id();
3400
3401                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3402
3403                 rcu_read_unlock();
3404                 preempt_enable();
3405         } else
3406 #endif
3407         {
3408                 unsigned int qtail;
3409                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3410                 put_cpu();
3411         }
3412         return ret;
3413 }
3414
3415 /**
3416  *      netif_rx        -       post buffer to the network code
3417  *      @skb: buffer to post
3418  *
3419  *      This function receives a packet from a device driver and queues it for
3420  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3421  *      may be dropped during processing for congestion control or by the
3422  *      protocol layers.
3423  *
3424  *      return values:
3425  *      NET_RX_SUCCESS  (no congestion)
3426  *      NET_RX_DROP     (packet was dropped)
3427  *
 *      Implementation: fires the netif_rx_entry tracepoint and hands the
 *      skb to netif_rx_internal(), whose result is returned unchanged.
3428  */
3429
3430 int netif_rx(struct sk_buff *skb)
3431 {
3432         trace_netif_rx_entry(skb);
3433
3434         return netif_rx_internal(skb);
3435 }
3436 EXPORT_SYMBOL(netif_rx);
3437
/*
 * Variant of netif_rx() that also runs pending softirqs before returning.
 *
 * The buffer is queued with netif_rx_internal() while preemption is
 * disabled; if local_softirq_pending() then reports work, do_softirq() is
 * invoked before preemption is enabled again.
 * NOTE(review): the "_ni" suffix presumably means "not in interrupt
 * context" - confirm against callers.
 *
 * Returns the value produced by netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int rc;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();

	rc = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
3453
3454 static void net_tx_action(struct softirq_action *h)
3455 {
3456         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3457
3458         if (sd->completion_queue) {
3459                 struct sk_buff *clist;
3460
3461                 local_irq_disable();
3462                 clist = sd->completion_queue;
3463                 sd->completion_queue = NULL;
3464                 local_irq_enable();
3465
3466                 while (clist) {
3467                         struct sk_buff *skb = clist;
3468                         clist = clist->next;
3469
3470                         WARN_ON(atomic_read(&skb->users));
3471                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3472                                 trace_consume_skb(skb);
3473                         else
3474                                 trace_kfree_skb(skb, net_tx_action);
3475                         __kfree_skb(skb);
3476                 }
3477         }
3478
3479         if (sd->output_queue) {
3480                 struct Qdisc *head;
3481
3482                 local_irq_disable();
3483                 head = sd->output_queue;
3484                 sd->output_queue = NULL;
3485                 sd->output_queue_tailp = &sd->output_queue;
3486                 local_irq_enable();
3487
3488                 while (head) {
3489                         struct Qdisc *q = head;
3490                         spinlock_t *root_lock;
3491
3492                         head = head->next_sched;
3493
3494                         root_lock = qdisc_lock(q);
3495                         if (spin_trylock(root_lock)) {
3496                                 smp_mb__before_atomic();
3497                                 clear_bit(__QDISC_STATE_SCHED,
3498                                           &q->state);
3499                                 qdisc_run(q);
3500                                 spin_unlock(root_lock);
3501                         } else {
3502                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3503                                               &q->state)) {
3504                                         __netif_reschedule(q);
3505                                 } else {
3506                                         smp_mb__before_atomic();
3507                                         clear_bit(__QDISC_STATE_SCHED,
3508                                                   &q->state);
3509                                 }
3510                         }
3511                 }
3512         }
3513 }
3514
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * Hook pointer defined here for ATM LANE.  It is only built when both
 * bridging and ATM LANE are enabled (built in or as modules) and is
 * exported GPL-only.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr)
	__read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3522
3523 #ifdef CONFIG_NET_CLS_ACT
3524 /* TODO: Maybe we should just force sch_ingress to be compiled in
3525  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3526  * a compare and 2 stores extra right now if we dont have it on
3527  * but have CONFIG_NET_CLS_ACT
3528  * NOTE: This doesn't stop any functionality; if you dont have
3529  * the ingress scheduler, you just can't add policies on ingress.
3530  *
3531  */
3532 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3533 {
3534         struct net_device *dev = skb->dev;
3535         u32 ttl = G_TC_RTTL(skb->tc_verd);
3536         int result = TC_ACT_OK;
3537         struct Qdisc *q;
3538
3539         if (unlikely(MAX_RED_LOOP < ttl++)) {
3540                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3541                                      skb->skb_iif, dev->ifindex);
3542                 return TC_ACT_SHOT;
3543         }
3544
3545         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3546         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3547
3548         q = rcu_dereference(rxq->qdisc);
3549         if (q != &noop_qdisc) {
3550                 spin_lock(qdisc_lock(q));
3551                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3552                         result = qdisc_enqueue_root(skb, q);
3553                 spin_unlock(qdisc_lock(q));
3554         }
3555
3556         return result;
3557 }
3558
3559 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3560                                          struct packet_type **pt_prev,
3561                                          int *ret, struct net_device *orig_dev)
3562 {
3563         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3564
3565         if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3566                 return skb;
3567
3568         if (*pt_prev) {
3569                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3570                 *pt_prev = NULL;
3571         }
3572
3573         switch (ing_filter(skb, rxq)) {
3574         case TC_ACT_SHOT:
3575         case TC_ACT_STOLEN:
3576                 kfree_skb(skb);
3577                 return NULL;
3578         }
3579
3580         return skb;
3581 }
3582 #endif
3583
3584 /**
3585  *      netdev_rx_handler_register - register receive handler
3586  *      @dev: device to register a handler for
3587  *      @rx_handler: receive handler to register
3588  *      @rx_handler_data: data pointer that is used by rx handler
3589  *
3590  *      Register a receive handler for a device. This handler will then be
3591  *      called from __netif_receive_skb. A negative errno code is returned
3592  *      on a failure.
3593  *
3594  *      The caller must hold the rtnl_mutex.
3595  *
3596  *      For a general description of rx_handler, see enum rx_handler_result.
3597  */
3598 int netdev_rx_handler_register(struct net_device *dev,
3599                                rx_handler_func_t *rx_handler,
3600                                void *rx_handler_data)
3601 {
3602         ASSERT_RTNL();
3603
3604         if (dev->rx_handler)
3605                 return -EBUSY;
3606
3607         /* Note: rx_handler_data must be set before rx_handler */
3608         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3609         rcu_assign_pointer(dev->rx_handler, rx_handler);
3610
3611         return 0;
3612 }
3613 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3614
3615 /**
3616  *      netdev_rx_handler_unregister - unregister receive handler
3617  *      @dev: device to unregister a handler from
3618  *
3619  *      Unregister a receive handler from a device.
3620  *
3621  *      The caller must hold the rtnl_mutex.
3622  */
3623 void netdev_rx_handler_unregister(struct net_device *dev)
3624 {
3625
3626         ASSERT_RTNL();
3627         RCU_INIT_POINTER(dev->rx_handler, NULL);
3628         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3629          * section has a guarantee to see a non NULL rx_handler_data
3630          * as well.
3631          */
3632         synchronize_net();
3633         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3634 }
3635 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3636
3637 /*
3638  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3639  * the special handling of PFMEMALLOC skbs.
3640  */
3641 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3642 {
3643         switch (skb->protocol) {
3644         case htons(ETH_P_ARP):
3645         case htons(ETH_P_IP):
3646         case htons(ETH_P_IPV6):
3647         case htons(ETH_P_8021Q):
3648         case htons(ETH_P_8021AD):
3649                 return true;
3650         default:
3651                 return false;
3652         }
3653 }
3654
/*
 * Core of the receive path: run one buffer through every consumer that may
 * want it, all inside a single rcu_read_lock() section.
 *
 * @skb:        buffer to process
 * @pfmemalloc: when true, the ptype_all taps are skipped and the buffer is
 *              dropped unless skb_pfmemalloc_protocol() accepts its protocol
 *
 * Stages, in order: header offset reset, VLAN untagging for 802.1Q/802.1ad
 * frames, the global and per-device ptype_all handlers, the ingress hook
 * (CONFIG_NET_CLS_ACT only), vlan_do_receive() for tagged buffers, the
 * device's rx_handler, and finally the handlers matching skb->protocol.
 *
 * Delivery is deferred by one step: pt_prev remembers the handler found
 * most recently, and it is only handed the buffer through deliver_skb()
 * once a further consumer is about to look at it.  Whatever is left in
 * pt_prev at the end has its ->func called directly; if nothing is left,
 * the buffer is counted in rx_dropped and freed.
 *
 * Returns NET_RX_SUCCESS when the rx_handler consumed the buffer,
 * NET_RX_DROP when it was dropped at the end, otherwise the last value
 * stored in ret by a handler.
 */
3655 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3656 {
3657         struct packet_type *ptype, *pt_prev;
3658         rx_handler_func_t *rx_handler;
3659         struct net_device *orig_dev;
3660         bool deliver_exact = false;
3661         int ret = NET_RX_DROP;
3662         __be16 type;
3663
3664         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3665
3666         trace_netif_receive_skb(skb);
3667
             /* Remember the device the buffer arrived with; skb->dev is
              * compared against it near the end of the function.
              */
3668         orig_dev = skb->dev;
3669
3670         skb_reset_network_header(skb);
3671         if (!skb_transport_header_was_set(skb))
3672                 skb_reset_transport_header(skb);
3673         skb_reset_mac_len(skb);
3674
3675         pt_prev = NULL;
3676
3677         rcu_read_lock();
3678
             /* Re-entered when vlan_do_receive() returns true or the
              * rx_handler answers RX_HANDLER_ANOTHER.
              */
3679 another_round:
3680         skb->skb_iif = skb->dev->ifindex;
3681
3682         __this_cpu_inc(softnet_data.processed);
3683
3684         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3685             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3686                 skb = skb_vlan_untag(skb);
3687                 if (unlikely(!skb))
3688                         goto unlock;
3689         }
3690
3691 #ifdef CONFIG_NET_CLS_ACT
             /* TC_NCLS set: clear it and bypass both the taps and the
              * ingress hook.
              */
3692         if (skb->tc_verd & TC_NCLS) {
3693                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3694                 goto ncls;
3695         }
3696 #endif
3697
3698         if (pfmemalloc)
3699                 goto skip_taps;
3700
             /* Global taps first, then the taps bound to this device. */
3701         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3702                 if (pt_prev)
3703                         ret = deliver_skb(skb, pt_prev, orig_dev);
3704                 pt_prev = ptype;
3705         }
3706
3707         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3708                 if (pt_prev)
3709                         ret = deliver_skb(skb, pt_prev, orig_dev);
3710                 pt_prev = ptype;
3711         }
3712
3713 skip_taps:
3714 #ifdef CONFIG_NET_CLS_ACT
3715         if (static_key_false(&ingress_needed)) {
                     /* handle_ing() returns NULL once it has freed the skb. */
3716                 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3717                 if (!skb)
3718                         goto unlock;
3719         }
3720
3721         skb->tc_verd = 0;
3722 ncls:
3723 #endif
3724         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3725                 goto drop;
3726
3727         if (skb_vlan_tag_present(skb)) {
3728                 if (pt_prev) {
3729                         ret = deliver_skb(skb, pt_prev, orig_dev);
3730                         pt_prev = NULL;
3731                 }
3732                 if (vlan_do_receive(&skb))
3733                         goto another_round;
3734                 else if (unlikely(!skb))
3735                         goto unlock;
3736         }
3737
3738         rx_handler = rcu_dereference(skb->dev->rx_handler);
3739         if (rx_handler) {
3740                 if (pt_prev) {
3741                         ret = deliver_skb(skb, pt_prev, orig_dev);
3742                         pt_prev = NULL;
3743                 }
3744                 switch (rx_handler(&skb)) {
3745                 case RX_HANDLER_CONSUMED:
3746                         ret = NET_RX_SUCCESS;
3747                         goto unlock;
3748                 case RX_HANDLER_ANOTHER:
3749                         goto another_round;
3750                 case RX_HANDLER_EXACT:
3751                         deliver_exact = true;
                             /* fall through */
3752                 case RX_HANDLER_PASS:
3753                         break;
3754                 default:
3755                         BUG();
3756                 }
3757         }
3758
             /* A VLAN tag is still present at this point. */
3759         if (unlikely(skb_vlan_tag_present(skb))) {
3760                 if (skb_vlan_tag_get_id(skb))
3761                         skb->pkt_type = PACKET_OTHERHOST;
3762                 /* Note: we might in the future use prio bits
3763                  * and set skb->priority like in vlan_do_receive()
3764                  * For the time being, just ignore Priority Code Point
3765                  */
3766                 skb->vlan_tci = 0;
3767         }
3768
3769         type = skb->protocol;
3770
3771         /* deliver only exact match when indicated */
3772         if (likely(!deliver_exact)) {
3773                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3774                                        &ptype_base[ntohs(type) &
3775                                                    PTYPE_HASH_MASK]);
3776         }
3777
3778         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3779                                &orig_dev->ptype_specific);
3780
             /* skb->dev differs from the original device: also try the
              * handlers bound to the current device.
              */
3781         if (unlikely(skb->dev != orig_dev)) {
3782                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3783                                        &skb->dev->ptype_specific);
3784         }
3785
             /* Last pending handler: called directly rather than through
              * deliver_skb().
              */
3786         if (pt_prev) {
3787                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3788                         goto drop;
3789                 else
3790                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3791         } else {
                     /* Also reached by "goto drop" from the code above. */
3792 drop:
3793                 atomic_long_inc(&skb->dev->rx_dropped);
3794                 kfree_skb(skb);
3795                 /* Jamal, now you will not able to escape explaining
3796                  * me how you were going to use this. :-)
3797                  */
3798                 ret = NET_RX_DROP;
3799         }
3800
3801 unlock:
3802         rcu_read_unlock();
3803         return ret;
3804 }
3805
3806 static int __netif_receive_skb(struct sk_buff *skb)
3807 {
3808         int ret;
3809
3810         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3811                 unsigned long pflags = current->flags;
3812
3813                 /*
3814                  * PFMEMALLOC skbs are special, they should
3815                  * - be delivered to SOCK_MEMALLOC sockets only
3816                  * - stay away from userspace
3817                  * - have bounded memory usage
3818                  *
3819                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3820                  * context down to all allocation sites.
3821                  */
3822                 current->flags |= PF_MEMALLOC;
3823                 ret = __netif_receive_skb_core(skb, true);
3824                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3825         } else
3826                 ret = __netif_receive_skb_core(skb, false);
3827
3828         return ret;
3829 }
3830
3831 static int netif_receive_skb_internal(struct sk_buff *skb)
3832 {
3833         net_timestamp_check(netdev_tstamp_prequeue, skb);
3834
3835         if (skb_defer_rx_timestamp(skb))
3836                 return NET_RX_SUCCESS;
3837
3838 #ifdef CONFIG_RPS
3839         if (static_key_false(&rps_needed)) {
3840                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3841                 int cpu, ret;
3842
3843                 rcu_read_lock();
3844
3845                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3846
3847                 if (cpu >= 0) {
3848                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3849                         rcu_read_unlock();
3850                         return ret;
3851                 }
3852                 rcu_read_unlock();
3853         }
3854 #endif
3855         return __netif_receive_skb(skb);
3856 }
3857
/**
 *	netif_receive_skb_sk - process receive buffer from network
 *	@sk: socket argument; not used by the body of this function
 *	@skb: buffer to process
 *
 *	This is the main receive data processing function: it fires the
 *	entry tracepoint and hands the buffer to
 *	netif_receive_skb_internal().  It always succeeds.  The buffer may
 *	be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb)
{
	int rc;

	trace_netif_receive_skb_entry(skb);

	rc = netif_receive_skb_internal(skb);
	return rc;
}
EXPORT_SYMBOL(netif_receive_skb_sk);
3880
3881 /* Network device is going away, flush any packets still pending
3882  * Called with irqs disabled.
3883  */
3884 static void flush_backlog(void *arg)
3885 {
3886         struct net_device *dev = arg;
3887         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3888         struct sk_buff *skb, *tmp;
3889
3890         rps_lock(sd);
3891         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3892                 if (skb->dev == dev) {
3893                         __skb_unlink(skb, &sd->input_pkt_queue);
3894                         kfree_skb(skb);
3895                         input_queue_head_incr(sd);
3896                 }
3897         }
3898         rps_unlock(sd);
3899
3900         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3901                 if (skb->dev == dev) {
3902                         __skb_unlink(skb, &sd->process_queue);
3903                         kfree_skb(skb);
3904                         input_queue_head_incr(sd);
3905                 }
3906         }
3907 }
3908
3909 static int napi_gro_complete(struct sk_buff *skb)
3910 {
3911         struct packet_offload *ptype;
3912         __be16 type = skb->protocol;
3913         struct list_head *head = &offload_base;
3914         int err = -ENOENT;
3915
3916         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3917
3918         if (NAPI_GRO_CB(skb)->count == 1) {
3919                 skb_shinfo(skb)->gso_size = 0;
3920                 goto out;
3921         }
3922
3923         rcu_read_lock();
3924         list_for_each_entry_rcu(ptype, head, list) {
3925                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3926                         continue;
3927
3928                 err = ptype->callbacks.gro_complete(skb, 0);
3929                 break;
3930         }
3931         rcu_read_unlock();
3932
3933         if (err) {
3934                 WARN_ON(&ptype->list == head);
3935                 kfree_skb(skb);
3936                 return NET_RX_SUCCESS;
3937         }
3938
3939 out:
3940         return netif_receive_skb_internal(skb);
3941 }
3942
3943 /* napi->gro_list contains packets ordered by age.
3944  * youngest packets at the head of it.
3945  * Complete skbs in reverse order to reduce latencies.
3946  */
3947 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3948 {
3949         struct sk_buff *skb, *prev = NULL;
3950
3951         /* scan list and build reverse chain */
3952         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3953                 skb->prev = prev;
3954                 prev = skb;
3955         }
3956
3957         for (skb = prev; skb; skb = prev) {
3958                 skb->next = NULL;
3959
3960                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3961                         return;
3962
3963                 prev = skb->prev;
3964                 napi_gro_complete(skb);
3965                 napi->gro_count--;
3966         }
3967
3968         napi->gro_list = NULL;
3969 }
3970 EXPORT_SYMBOL(napi_gro_flush);
3971
3972 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3973 {
3974         struct sk_buff *p;
3975         unsigned int maclen = skb->dev->hard_header_len;
3976         u32 hash = skb_get_hash_raw(skb);
3977
3978         for (p = napi->gro_list; p; p = p->next) {
3979                 unsigned long diffs;
3980
3981                 NAPI_GRO_CB(p)->flush = 0;
3982
3983                 if (hash != skb_get_hash_raw(p)) {
3984                         NAPI_GRO_CB(p)->same_flow = 0;
3985                         continue;
3986                 }
3987
3988                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3989                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3990                 if (maclen == ETH_HLEN)
3991                         diffs |= compare_ether_header(skb_mac_header(p),
3992                                                       skb_mac_header(skb));
3993                 else if (!diffs)
3994                         diffs = memcmp(skb_mac_header(p),
3995                                        skb_mac_header(skb),
3996                                        maclen);
3997                 NAPI_GRO_CB(p)->same_flow = !diffs;
3998         }
3999 }
4000
4001 static void skb_gro_reset_offset(struct sk_buff *skb)
4002 {
4003         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4004         const skb_frag_t *frag0 = &pinfo->frags[0];
4005
4006         NAPI_GRO_CB(skb)->data_offset = 0;
4007         NAPI_GRO_CB(skb)->frag0 = NULL;
4008         NAPI_GRO_CB(skb)->frag0_len = 0;
4009
4010         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4011             pinfo->nr_frags &&
4012             !PageHighMem(skb_frag_page(frag0))) {
4013                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4014                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4015         }
4016 }
4017
4018 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4019 {
4020         struct skb_shared_info *pinfo = skb_shinfo(skb);
4021
4022         BUG_ON(skb->end - skb->tail < grow);
4023
4024         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4025
4026         skb->data_len -= grow;
4027         skb->tail += grow;
4028
4029         pinfo->frags[0].page_offset += grow;
4030         skb_frag_size_sub(&pinfo->frags[0], grow);
4031
4032         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4033                 skb_frag_unref(skb, 0);
4034                 memmove(pinfo->frags, pinfo->frags + 1,
4035                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4036         }
4037 }
4038
/*
 * Try to aggregate @skb with the packets held on @napi->gro_list.
 *
 * The gro_receive callback of the offload registered for skb->protocol
 * does the protocol work; this function prepares the GRO control block
 * for it and then acts on the outcome.
 *
 * Returns:
 *   GRO_NORMAL       - GRO not attempted or refused: the device lacks
 *                      NETIF_F_GRO, the skb is GSO / has a frag list /
 *                      has csum_bad set, no offload matched, or the
 *                      callback set the flush flag
 *   GRO_MERGED,
 *   GRO_MERGED_FREE  - the callback marked @skb as same_flow; which of
 *                      the two depends on NAPI_GRO_CB(skb)->free
 *   GRO_HELD         - @skb was put at the head of gro_list
 *
 * Except on the same_flow path, any bytes GRO parsed beyond the linear
 * area are copied into the head from frag0 before returning.
 */
4039 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4040 {
4041         struct sk_buff **pp = NULL;
4042         struct packet_offload *ptype;
4043         __be16 type = skb->protocol;
4044         struct list_head *head = &offload_base;
4045         int same_flow;
4046         enum gro_result ret;
4047         int grow;
4048
4049         if (!(skb->dev->features & NETIF_F_GRO))
4050                 goto normal;
4051
4052         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4053                 goto normal;
4054
             /* Mark which held packets could share a flow with skb. */
4055         gro_list_prepare(napi, skb);
4056
4057         rcu_read_lock();
4058         list_for_each_entry_rcu(ptype, head, list) {
4059                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4060                         continue;
4061
                     /* First matching offload: set up the control block,
                      * call it once, and leave the loop.
                      */
4062                 skb_set_network_header(skb, skb_gro_offset(skb));
4063                 skb_reset_mac_len(skb);
4064                 NAPI_GRO_CB(skb)->same_flow = 0;
4065                 NAPI_GRO_CB(skb)->flush = 0;
4066                 NAPI_GRO_CB(skb)->free = 0;
4067                 NAPI_GRO_CB(skb)->udp_mark = 0;
4068                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4069
4070                 /* Setup for GRO checksum validation */
4071                 switch (skb->ip_summed) {
4072                 case CHECKSUM_COMPLETE:
4073                         NAPI_GRO_CB(skb)->csum = skb->csum;
4074                         NAPI_GRO_CB(skb)->csum_valid = 1;
4075                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4076                         break;
4077                 case CHECKSUM_UNNECESSARY:
4078                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4079                         NAPI_GRO_CB(skb)->csum_valid = 0;
4080                         break;
4081                 default:
4082                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4083                         NAPI_GRO_CB(skb)->csum_valid = 0;
4084                 }
4085
4086                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4087                 break;
4088         }
4089         rcu_read_unlock();
4090
             /* The loop ran off the end: no offload for this protocol. */
4091         if (&ptype->list == head)
4092                 goto normal;
4093
4094         same_flow = NAPI_GRO_CB(skb)->same_flow;
4095         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4096
             /* A non-NULL pp designates a held packet that must be
              * completed now: unlink it from gro_list and send it up.
              */
4097         if (pp) {
4098                 struct sk_buff *nskb = *pp;
4099
4100                 *pp = nskb->next;
4101                 nskb->next = NULL;
4102                 napi_gro_complete(nskb);
4103                 napi->gro_count--;
4104         }
4105
4106         if (same_flow)
4107                 goto ok;
4108
4109         if (NAPI_GRO_CB(skb)->flush)
4110                 goto normal;
4111
             /* skb starts a new held flow.  If the list is full, make
              * room by completing one entry; gro_count then stays as is.
              */
4112         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4113                 struct sk_buff *nskb = napi->gro_list;
4114
4115                 /* locate the end of the list to select the 'oldest' flow */
4116                 while (nskb->next) {
4117                         pp = &nskb->next;
4118                         nskb = *pp;
4119                 }
4120                 *pp = NULL;
4121                 nskb->next = NULL;
4122                 napi_gro_complete(nskb);
4123         } else {
4124                 napi->gro_count++;
4125         }
4126         NAPI_GRO_CB(skb)->count = 1;
4127         NAPI_GRO_CB(skb)->age = jiffies;
4128         NAPI_GRO_CB(skb)->last = skb;
4129         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4130         skb->next = napi->gro_list;
4131         napi->gro_list = skb;
4132         ret = GRO_HELD;
4133
4134 pull:
             /* Bytes parsed beyond the linear area are copied from frag0
              * into the head.
              */
4135         grow = skb_gro_offset(skb) - skb_headlen(skb);
4136         if (grow > 0)
4137                 gro_pull_from_frag0(skb, grow);
4138 ok:
4139         return ret;
4140
4141 normal:
4142         ret = GRO_NORMAL;
4143         goto pull;
4144 }
4145
4146 struct packet_offload *gro_find_receive_by_type(__be16 type)
4147 {
4148         struct list_head *offload_head = &offload_base;
4149         struct packet_offload *ptype;
4150
4151         list_for_each_entry_rcu(ptype, offload_head, list) {
4152                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4153                         continue;
4154                 return ptype;
4155         }
4156         return NULL;
4157 }
4158 EXPORT_SYMBOL(gro_find_receive_by_type);
4159
4160 struct packet_offload *gro_find_complete_by_type(__be16 type)
4161 {
4162         struct list_head *offload_head = &offload_base;
4163         struct packet_offload *ptype;
4164
4165         list_for_each_entry_rcu(ptype, offload_head, list) {
4166                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4167                         continue;
4168                 return ptype;
4169         }
4170         return NULL;
4171 }
4172 EXPORT_SYMBOL(gro_find_complete_by_type);
4173
4174 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4175 {
4176         switch (ret) {
4177         case GRO_NORMAL:
4178                 if (netif_receive_skb_internal(skb))
4179                         ret = GRO_DROP;
4180                 break;
4181
4182         case GRO_DROP:
4183                 kfree_skb(skb);
4184                 break;
4185
4186         case GRO_MERGED_FREE:
4187                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4188                         kmem_cache_free(skbuff_head_cache, skb);
4189                 else
4190                         __kfree_skb(skb);
4191                 break;
4192
4193         case GRO_HELD:
4194         case GRO_MERGED:
4195                 break;
4196         }
4197
4198         return ret;
4199 }
4200
4201 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4202 {
4203         trace_napi_gro_receive_entry(skb);
4204
4205         skb_gro_reset_offset(skb);
4206
4207         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4208 }
4209 EXPORT_SYMBOL(napi_gro_receive);
4210
4211 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4212 {
4213         if (unlikely(skb->pfmemalloc)) {
4214                 consume_skb(skb);
4215                 return;
4216         }
4217         __skb_pull(skb, skb_headlen(skb));
4218         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4219         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4220         skb->vlan_tci = 0;
4221         skb->dev = napi->dev;
4222         skb->skb_iif = 0;
4223         skb->encapsulation = 0;
4224         skb_shinfo(skb)->gso_type = 0;
4225         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4226
4227         napi->skb = skb;
4228 }
4229
4230 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4231 {
4232         struct sk_buff *skb = napi->skb;
4233
4234         if (!skb) {
4235                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4236                 napi->skb = skb;
4237         }
4238         return skb;
4239 }
4240 EXPORT_SYMBOL(napi_get_frags);
4241
4242 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4243                                       struct sk_buff *skb,
4244                                       gro_result_t ret)
4245 {
4246         switch (ret) {
4247         case GRO_NORMAL:
4248         case GRO_HELD:
4249                 __skb_push(skb, ETH_HLEN);
4250                 skb->protocol = eth_type_trans(skb, skb->dev);
4251                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4252                         ret = GRO_DROP;
4253                 break;
4254
4255         case GRO_DROP:
4256         case GRO_MERGED_FREE:
4257                 napi_reuse_skb(napi, skb);
4258                 break;
4259
4260         case GRO_MERGED:
4261                 break;
4262         }
4263
4264         return ret;
4265 }
4266
4267 /* Upper GRO stack assumes network header starts at gro_offset=0
4268  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4269  * We copy ethernet header into skb->data to have a common layout.
4270  */
4271 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4272 {
4273         struct sk_buff *skb = napi->skb;
4274         const struct ethhdr *eth;
4275         unsigned int hlen = sizeof(*eth);
4276
4277         napi->skb = NULL;
4278
4279         skb_reset_mac_header(skb);
4280         skb_gro_reset_offset(skb);
4281
4282         eth = skb_gro_header_fast(skb, 0);
4283         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4284                 eth = skb_gro_header_slow(skb, hlen, 0);
4285                 if (unlikely(!eth)) {
4286                         napi_reuse_skb(napi, skb);
4287                         return NULL;
4288                 }
4289         } else {
4290                 gro_pull_from_frag0(skb, hlen);
4291                 NAPI_GRO_CB(skb)->frag0 += hlen;
4292                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4293         }
4294         __skb_pull(skb, hlen);
4295
4296         /*
4297          * This works because the only protocols we care about don't require
4298          * special handling.
4299          * We'll fix it up properly in napi_frags_finish()
4300          */
4301         skb->protocol = eth->h_proto;
4302
4303         return skb;
4304 }
4305
4306 gro_result_t napi_gro_frags(struct napi_struct *napi)
4307 {
4308         struct sk_buff *skb = napi_frags_skb(napi);
4309
4310         if (!skb)
4311                 return GRO_DROP;
4312
4313         trace_napi_gro_frags_entry(skb);
4314
4315         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4316 }
4317 EXPORT_SYMBOL(napi_gro_frags);
4318
4319 /* Compute the checksum from gro_offset and return the folded value
4320  * after adding in any pseudo checksum.
4321  */
4322 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4323 {
4324         __wsum wsum;
4325         __sum16 sum;
4326
4327         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4328
4329         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4330         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4331         if (likely(!sum)) {
4332                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4333                     !skb->csum_complete_sw)
4334                         netdev_rx_csum_fault(skb->dev);
4335         }
4336
4337         NAPI_GRO_CB(skb)->csum = wsum;
4338         NAPI_GRO_CB(skb)->csum_valid = 1;
4339
4340         return sum;
4341 }
4342 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4343
/*
 * net_rps_action_and_irq_enable - send any pending IPIs for RPS.
 *
 * Must be entered with local irqs disabled; always returns with local irqs
 * enabled.  With CONFIG_RPS the pending list is detached from @sd before
 * irqs are re-enabled, then walked with irqs on.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;
	struct softnet_data *following;

	if (!pending) {
		local_irq_enable();
		return;
	}

	sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	for (; pending; pending = following) {
		/* Fetch the link before the IPI is issued for this entry */
		following = pending->rps_ipi_next;

		if (cpu_online(pending->cpu))
			smp_call_function_single_async(pending->cpu,
						       &pending->csd);
	}
#else
	local_irq_enable();
#endif
}
4371
4372 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4373 {
4374 #ifdef CONFIG_RPS
4375         return sd->rps_ipi_list != NULL;
4376 #else
4377         return false;
4378 #endif
4379 }
4380
/*
 * Drain the backlog queues of the softnet_data that embeds @napi.
 * Has the NAPI poll signature: @quota bounds how many packets are handed
 * to __netif_receive_skb() in one call.
 *
 * Returns the number of packets processed.  When @quota is reached the
 * function returns at once and napi->state is left as is; when
 * input_pkt_queue is found empty napi->state is cleared before returning.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd_has_rps_ipi_waiting(sd)) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}

	napi->weight = weight_p;
	local_irq_disable();
	while (1) {
		struct sk_buff *skb;

		/* process_queue is dequeued with irqs off; irqs are enabled
		 * only around the actual packet delivery.
		 */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();
			__netif_receive_skb(skb);
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		rps_lock(sd);
		if (skb_queue_empty(&sd->input_pkt_queue)) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on backlog.
			 * We can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			napi->state = 0;
			rps_unlock(sd);

			break;
		}

		/* Move everything queued so far onto process_queue and go
		 * around again to deliver it.
		 */
		skb_queue_splice_tail_init(&sd->input_pkt_queue,
					   &sd->process_queue);
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
4434
4435 /**
4436  * __napi_schedule - schedule for receive
4437  * @n: entry to schedule
4438  *
4439  * The entry's receive function will be scheduled to run.
4440  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4441  */
4442 void __napi_schedule(struct napi_struct *n)
4443 {
4444         unsigned long flags;
4445
4446         local_irq_save(flags);
4447         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4448         local_irq_restore(flags);
4449 }
4450 EXPORT_SYMBOL(__napi_schedule);
4451
4452 /**
4453  * __napi_schedule_irqoff - schedule for receive
4454  * @n: entry to schedule
4455  *
4456  * Variant of __napi_schedule() assuming hard irqs are masked
4457  */
4458 void __napi_schedule_irqoff(struct napi_struct *n)
4459 {
4460         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4461 }
4462 EXPORT_SYMBOL(__napi_schedule_irqoff);
4463
/* Take @n off whatever poll list it is on and clear NAPI_STATE_SCHED.
 * BUGs if @n is not currently marked scheduled.  napi_complete_done()
 * invokes this with local irqs saved/disabled.
 */
void __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));

	list_del_init(&n->poll_list);
	/* Barrier placed ahead of the SCHED clear so the list removal above
	 * is ordered before the bit is dropped.
	 */
	smp_mb__before_atomic();
	clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
4473
4474 void napi_complete_done(struct napi_struct *n, int work_done)
4475 {
4476         unsigned long flags;
4477
4478         /*
4479          * don't let napi dequeue from the cpu poll list
4480          * just in case its running on a different cpu
4481          */
4482         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4483                 return;
4484
4485         if (n->gro_list) {
4486                 unsigned long timeout = 0;
4487
4488                 if (work_done)
4489                         timeout = n->dev->gro_flush_timeout;
4490
4491                 if (timeout)
4492                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4493                                       HRTIMER_MODE_REL_PINNED);
4494                 else
4495                         napi_gro_flush(n, false);
4496         }
4497         if (likely(list_empty(&n->poll_list))) {
4498                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4499         } else {
4500                 /* If n->poll_list is not empty, we need to mask irqs */
4501                 local_irq_save(flags);
4502                 __napi_complete(n);
4503                 local_irq_restore(flags);
4504         }
4505 }
4506 EXPORT_SYMBOL(napi_complete_done);
4507
4508 /* must be called under rcu_read_lock(), as we dont take a reference */
4509 struct napi_struct *napi_by_id(unsigned int napi_id)
4510 {
4511         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4512         struct napi_struct *napi;
4513
4514         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4515                 if (napi->napi_id == napi_id)
4516                         return napi;
4517
4518         return NULL;
4519 }
4520 EXPORT_SYMBOL_GPL(napi_by_id);
4521
4522 void napi_hash_add(struct napi_struct *napi)
4523 {
4524         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4525
4526                 spin_lock(&napi_hash_lock);
4527
4528                 /* 0 is not a valid id, we also skip an id that is taken
4529                  * we expect both events to be extremely rare
4530                  */
4531                 napi->napi_id = 0;
4532                 while (!napi->napi_id) {
4533                         napi->napi_id = ++napi_gen_id;
4534                         if (napi_by_id(napi->napi_id))
4535                                 napi->napi_id = 0;
4536                 }
4537
4538                 hlist_add_head_rcu(&napi->napi_hash_node,
4539                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4540
4541                 spin_unlock(&napi_hash_lock);
4542         }
4543 }
4544 EXPORT_SYMBOL_GPL(napi_hash_add);
4545
4546 /* Warning : caller is responsible to make sure rcu grace period
4547  * is respected before freeing memory containing @napi
4548  */
4549 void napi_hash_del(struct napi_struct *napi)
4550 {
4551         spin_lock(&napi_hash_lock);
4552
4553         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4554                 hlist_del_rcu(&napi->napi_hash_node);
4555
4556         spin_unlock(&napi_hash_lock);
4557 }
4558 EXPORT_SYMBOL_GPL(napi_hash_del);
4559
4560 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4561 {
4562         struct napi_struct *napi;
4563
4564         napi = container_of(timer, struct napi_struct, timer);
4565         if (napi->gro_list)
4566                 napi_schedule(napi);
4567
4568         return HRTIMER_NORESTART;
4569 }
4570
4571 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4572                     int (*poll)(struct napi_struct *, int), int weight)
4573 {
4574         INIT_LIST_HEAD(&napi->poll_list);
4575         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4576         napi->timer.function = napi_watchdog;
4577         napi->gro_count = 0;
4578         napi->gro_list = NULL;
4579         napi->skb = NULL;
4580         napi->poll = poll;
4581         if (weight > NAPI_POLL_WEIGHT)
4582                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4583                             weight, dev->name);
4584         napi->weight = weight;
4585         list_add(&napi->dev_list, &dev->napi_list);
4586         napi->dev = dev;
4587 #ifdef CONFIG_NETPOLL
4588         spin_lock_init(&napi->poll_lock);
4589         napi->poll_owner = -1;
4590 #endif
4591         set_bit(NAPI_STATE_SCHED, &napi->state);
4592 }
4593 EXPORT_SYMBOL(netif_napi_add);
4594
4595 void napi_disable(struct napi_struct *n)
4596 {
4597         might_sleep();
4598         set_bit(NAPI_STATE_DISABLE, &n->state);
4599
4600         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4601                 msleep(1);
4602
4603         hrtimer_cancel(&n->timer);
4604
4605         clear_bit(NAPI_STATE_DISABLE, &n->state);
4606 }
4607 EXPORT_SYMBOL(napi_disable);
4608
4609 void netif_napi_del(struct napi_struct *napi)
4610 {
4611         list_del_init(&napi->dev_list);
4612         napi_free_frags(napi);
4613
4614         kfree_skb_list(napi->gro_list);
4615         napi->gro_list = NULL;
4616         napi->gro_count = 0;
4617 }
4618 EXPORT_SYMBOL(netif_napi_del);
4619
/*
 * Run one poll round for @n.
 * @n:      napi instance, removed from its current poll list on entry
 * @repoll: list that @n is appended to when it used its whole weight and
 *          has to be polled again
 *
 * Returns the amount of work reported by n->poll(), or 0 when the poll
 * callback was not invoked because NAPI_STATE_SCHED was not set.
 */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	void *have;
	int work, weight;

	list_del_init(&n->poll_list);

	have = netpoll_poll_lock(n);

	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi().  Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call.  Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight);
		trace_napi_poll(n);
	}

	/* A poll callback must never report more than it was allowed */
	WARN_ON_ONCE(work > weight);

	/* Less than the full weight used: nothing more to do here */
	if (likely(work < weight))
		goto out_unlock;

	/* Drivers must not modify the NAPI state if they
	 * consume the entire weight.  In such cases this code
	 * still "owns" the NAPI instance and therefore can
	 * move the instance around on the list at-will.
	 */
	if (unlikely(napi_disable_pending(n))) {
		napi_complete(n);
		goto out_unlock;
	}

	if (n->gro_list) {
		/* flush too old packets
		 * If HZ < 1000, flush all packets.
		 */
		napi_gro_flush(n, HZ >= 1000);
	}

	/* Some drivers may have called napi_schedule
	 * prior to exhausting their budget.
	 */
	if (unlikely(!list_empty(&n->poll_list))) {
		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
			     n->dev ? n->dev->name : "backlog");
		goto out_unlock;
	}

	/* Full weight consumed and still owned here: queue for another round */
	list_add_tail(&n->poll_list, repoll);

out_unlock:
	netpoll_poll_unlock(have);

	return work;
}
4681
/*
 * Poll the napi instances queued on this CPU's softnet_data.
 *
 * The per-CPU poll list is moved onto a private list and each entry is
 * passed to napi_poll() until the private list is empty, the budget
 * (netdev_budget) is used up, or the time limit (jiffies + 2) is reached.
 * Whatever remains is put back on sd->poll_list and NET_RX_SOFTIRQ is
 * raised again if that list is not empty.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	LIST_HEAD(list);
	LIST_HEAD(repoll);

	/* Detach the pending list with irqs off, then work on the copy */
	local_irq_disable();
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n;

		if (list_empty(&list)) {
			/* Nothing to requeue and no RPS IPI pending: done */
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				return;
			break;
		}

		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll);

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	local_irq_disable();

	/* Resulting sd->poll_list order: unprocessed entries from this run,
	 * then entries added to sd->poll_list meanwhile, then the repoll
	 * entries.
	 */
	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	/* Also re-enables local irqs */
	net_rps_action_and_irq_enable(sd);
}
4727
/* One entry on a device's upper or lower adjacency list, describing a link
 * to another net_device.
 */
struct netdev_adjacent {
	/* the device at the other end of the link */
	struct net_device *dev;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* counter for the number of times this device was added to us */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	/* linkage into the owning adjacency list */
	struct list_head list;
	/* used to free the entry through kfree_rcu() */
	struct rcu_head rcu;
};
4743
4744 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4745                                                  struct net_device *adj_dev,
4746                                                  struct list_head *adj_list)
4747 {
4748         struct netdev_adjacent *adj;
4749
4750         list_for_each_entry(adj, adj_list, list) {
4751                 if (adj->dev == adj_dev)
4752                         return adj;
4753         }
4754         return NULL;
4755 }
4756
4757 /**
4758  * netdev_has_upper_dev - Check if device is linked to an upper device
4759  * @dev: device
4760  * @upper_dev: upper device to check
4761  *
4762  * Find out if a device is linked to specified upper device and return true
4763  * in case it is. Note that this checks only immediate upper device,
4764  * not through a complete stack of devices. The caller must hold the RTNL lock.
4765  */
4766 bool netdev_has_upper_dev(struct net_device *dev,
4767                           struct net_device *upper_dev)
4768 {
4769         ASSERT_RTNL();
4770
4771         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4772 }
4773 EXPORT_SYMBOL(netdev_has_upper_dev);
4774
4775 /**
4776  * netdev_has_any_upper_dev - Check if device is linked to some device
4777  * @dev: device
4778  *
4779  * Find out if a device is linked to an upper device and return true in case
4780  * it is. The caller must hold the RTNL lock.
4781  */
4782 static bool netdev_has_any_upper_dev(struct net_device *dev)
4783 {
4784         ASSERT_RTNL();
4785
4786         return !list_empty(&dev->all_adj_list.upper);
4787 }
4788
4789 /**
4790  * netdev_master_upper_dev_get - Get master upper device
4791  * @dev: device
4792  *
4793  * Find a master upper device and return pointer to it or NULL in case
4794  * it's not there. The caller must hold the RTNL lock.
4795  */
4796 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4797 {
4798         struct netdev_adjacent *upper;
4799
4800         ASSERT_RTNL();
4801
4802         if (list_empty(&dev->adj_list.upper))
4803                 return NULL;
4804
4805         upper = list_first_entry(&dev->adj_list.upper,
4806                                  struct netdev_adjacent, list);
4807         if (likely(upper->master))
4808                 return upper->dev;
4809         return NULL;
4810 }
4811 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4812
4813 void *netdev_adjacent_get_private(struct list_head *adj_list)
4814 {
4815         struct netdev_adjacent *adj;
4816
4817         adj = list_entry(adj_list, struct netdev_adjacent, list);
4818
4819         return adj->private;
4820 }
4821 EXPORT_SYMBOL(netdev_adjacent_get_private);
4822
4823 /**
4824  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4825  * @dev: device
4826  * @iter: list_head ** of the current position
4827  *
4828  * Gets the next device from the dev's upper list, starting from iter
4829  * position. The caller must hold RCU read lock.
4830  */
4831 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4832                                                  struct list_head **iter)
4833 {
4834         struct netdev_adjacent *upper;
4835
4836         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4837
4838         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4839
4840         if (&upper->list == &dev->adj_list.upper)
4841                 return NULL;
4842
4843         *iter = &upper->list;
4844
4845         return upper->dev;
4846 }
4847 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4848
4849 /**
4850  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4851  * @dev: device
4852  * @iter: list_head ** of the current position
4853  *
4854  * Gets the next device from the dev's upper list, starting from iter
4855  * position. The caller must hold RCU read lock.
4856  */
4857 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4858                                                      struct list_head **iter)
4859 {
4860         struct netdev_adjacent *upper;
4861
4862         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4863
4864         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4865
4866         if (&upper->list == &dev->all_adj_list.upper)
4867                 return NULL;
4868
4869         *iter = &upper->list;
4870
4871         return upper->dev;
4872 }
4873 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4874
4875 /**
4876  * netdev_lower_get_next_private - Get the next ->private from the
4877  *                                 lower neighbour list
4878  * @dev: device
4879  * @iter: list_head ** of the current position
4880  *
4881  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4882  * list, starting from iter position. The caller must hold either hold the
4883  * RTNL lock or its own locking that guarantees that the neighbour lower
4884  * list will remain unchainged.
4885  */
4886 void *netdev_lower_get_next_private(struct net_device *dev,
4887                                     struct list_head **iter)
4888 {
4889         struct netdev_adjacent *lower;
4890
4891         lower = list_entry(*iter, struct netdev_adjacent, list);
4892
4893         if (&lower->list == &dev->adj_list.lower)
4894                 return NULL;
4895
4896         *iter = lower->list.next;
4897
4898         return lower->private;
4899 }
4900 EXPORT_SYMBOL(netdev_lower_get_next_private);
4901
4902 /**
4903  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4904  *                                     lower neighbour list, RCU
4905  *                                     variant
4906  * @dev: device
4907  * @iter: list_head ** of the current position
4908  *
4909  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4910  * list, starting from iter position. The caller must hold RCU read lock.
4911  */
4912 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4913                                         struct list_head **iter)
4914 {
4915         struct netdev_adjacent *lower;
4916
4917         WARN_ON_ONCE(!rcu_read_lock_held());
4918
4919         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4920
4921         if (&lower->list == &dev->adj_list.lower)
4922                 return NULL;
4923
4924         *iter = &lower->list;
4925
4926         return lower->private;
4927 }
4928 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4929
4930 /**
4931  * netdev_lower_get_next - Get the next device from the lower neighbour
4932  *                         list
4933  * @dev: device
4934  * @iter: list_head ** of the current position
4935  *
4936  * Gets the next netdev_adjacent from the dev's lower neighbour
4937  * list, starting from iter position. The caller must hold RTNL lock or
4938  * its own locking that guarantees that the neighbour lower
4939  * list will remain unchainged.
4940  */
4941 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4942 {
4943         struct netdev_adjacent *lower;
4944
4945         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4946
4947         if (&lower->list == &dev->adj_list.lower)
4948                 return NULL;
4949
4950         *iter = &lower->list;
4951
4952         return lower->dev;
4953 }
4954 EXPORT_SYMBOL(netdev_lower_get_next);
4955
4956 /**
4957  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4958  *                                     lower neighbour list, RCU
4959  *                                     variant
4960  * @dev: device
4961  *
4962  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4963  * list. The caller must hold RCU read lock.
4964  */
4965 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4966 {
4967         struct netdev_adjacent *lower;
4968
4969         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4970                         struct netdev_adjacent, list);
4971         if (lower)
4972                 return lower->private;
4973         return NULL;
4974 }
4975 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4976
4977 /**
4978  * netdev_master_upper_dev_get_rcu - Get master upper device
4979  * @dev: device
4980  *
4981  * Find a master upper device and return pointer to it or NULL in case
4982  * it's not there. The caller must hold the RCU read lock.
4983  */
4984 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4985 {
4986         struct netdev_adjacent *upper;
4987
4988         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4989                                        struct netdev_adjacent, list);
4990         if (upper && likely(upper->master))
4991                 return upper->dev;
4992         return NULL;
4993 }
4994 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4995
4996 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4997                               struct net_device *adj_dev,
4998                               struct list_head *dev_list)
4999 {
5000         char linkname[IFNAMSIZ+7];
5001         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5002                 "upper_%s" : "lower_%s", adj_dev->name);
5003         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5004                                  linkname);
5005 }
5006 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5007                                char *name,
5008                                struct list_head *dev_list)
5009 {
5010         char linkname[IFNAMSIZ+7];
5011         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5012                 "upper_%s" : "lower_%s", name);
5013         sysfs_remove_link(&(dev->dev.kobj), linkname);
5014 }
5015
5016 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5017                                                  struct net_device *adj_dev,
5018                                                  struct list_head *dev_list)
5019 {
5020         return (dev_list == &dev->adj_list.upper ||
5021                 dev_list == &dev->adj_list.lower) &&
5022                 net_eq(dev_net(dev), dev_net(adj_dev));
5023 }
5024
5025 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5026                                         struct net_device *adj_dev,
5027                                         struct list_head *dev_list,
5028                                         void *private, bool master)
5029 {
5030         struct netdev_adjacent *adj;
5031         int ret;
5032
5033         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5034
5035         if (adj) {
5036                 adj->ref_nr++;
5037                 return 0;
5038         }
5039
5040         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5041         if (!adj)
5042                 return -ENOMEM;
5043
5044         adj->dev = adj_dev;
5045         adj->master = master;
5046         adj->ref_nr = 1;
5047         adj->private = private;
5048         dev_hold(adj_dev);
5049
5050         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5051                  adj_dev->name, dev->name, adj_dev->name);
5052
5053         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5054                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5055                 if (ret)
5056                         goto free_adj;
5057         }
5058
5059         /* Ensure that master link is always the first item in list. */
5060         if (master) {
5061                 ret = sysfs_create_link(&(dev->dev.kobj),
5062                                         &(adj_dev->dev.kobj), "master");
5063                 if (ret)
5064                         goto remove_symlinks;
5065
5066                 list_add_rcu(&adj->list, dev_list);
5067         } else {
5068                 list_add_tail_rcu(&adj->list, dev_list);
5069         }
5070
5071         return 0;
5072
5073 remove_symlinks:
5074         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5075                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5076 free_adj:
5077         kfree(adj);
5078         dev_put(adj_dev);
5079
5080         return ret;
5081 }
5082
5083 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5084                                          struct net_device *adj_dev,
5085                                          struct list_head *dev_list)
5086 {
5087         struct netdev_adjacent *adj;
5088
5089         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5090
5091         if (!adj) {
5092                 pr_err("tried to remove device %s from %s\n",
5093                        dev->name, adj_dev->name);
5094                 BUG();
5095         }
5096
5097         if (adj->ref_nr > 1) {
5098                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5099                          adj->ref_nr-1);
5100                 adj->ref_nr--;
5101                 return;
5102         }
5103
5104         if (adj->master)
5105                 sysfs_remove_link(&(dev->dev.kobj), "master");
5106
5107         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5108                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5109
5110         list_del_rcu(&adj->list);
5111         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5112                  adj_dev->name, dev->name, adj_dev->name);
5113         dev_put(adj_dev);
5114         kfree_rcu(adj, rcu);
5115 }
5116
5117 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5118                                             struct net_device *upper_dev,
5119                                             struct list_head *up_list,
5120                                             struct list_head *down_list,
5121                                             void *private, bool master)
5122 {
5123         int ret;
5124
5125         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5126                                            master);
5127         if (ret)
5128                 return ret;
5129
5130         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5131                                            false);
5132         if (ret) {
5133                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5134                 return ret;
5135         }
5136
5137         return 0;
5138 }
5139
5140 static int __netdev_adjacent_dev_link(struct net_device *dev,
5141                                       struct net_device *upper_dev)
5142 {
5143         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5144                                                 &dev->all_adj_list.upper,
5145                                                 &upper_dev->all_adj_list.lower,
5146                                                 NULL, false);
5147 }
5148
/* Undo __netdev_adjacent_dev_link_lists(): drop @upper_dev from @up_list
 * first, then @dev from @down_list.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	/* upward direction */
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);

	/* downward direction */
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
5157
5158 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5159                                          struct net_device *upper_dev)
5160 {
5161         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5162                                            &dev->all_adj_list.upper,
5163                                            &upper_dev->all_adj_list.lower);
5164 }
5165
5166 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5167                                                 struct net_device *upper_dev,
5168                                                 void *private, bool master)
5169 {
5170         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5171
5172         if (ret)
5173                 return ret;
5174
5175         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5176                                                &dev->adj_list.upper,
5177                                                &upper_dev->adj_list.lower,
5178                                                private, master);
5179         if (ret) {
5180                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5181                 return ret;
5182         }
5183
5184         return 0;
5185 }
5186
5187 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5188                                                    struct net_device *upper_dev)
5189 {
5190         __netdev_adjacent_dev_unlink(dev, upper_dev);
5191         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5192                                            &dev->adj_list.upper,
5193                                            &upper_dev->adj_list.lower);
5194 }
5195
5196 static int __netdev_upper_dev_link(struct net_device *dev,
5197                                    struct net_device *upper_dev, bool master,
5198                                    void *private)
5199 {
5200         struct netdev_adjacent *i, *j, *to_i, *to_j;
5201         int ret = 0;
5202
5203         ASSERT_RTNL();
5204
5205         if (dev == upper_dev)
5206                 return -EBUSY;
5207
5208         /* To prevent loops, check if dev is not upper device to upper_dev. */
5209         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5210                 return -EBUSY;
5211
5212         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5213                 return -EEXIST;
5214
5215         if (master && netdev_master_upper_dev_get(dev))
5216                 return -EBUSY;
5217
5218         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5219                                                    master);
5220         if (ret)
5221                 return ret;
5222
5223         /* Now that we linked these devs, make all the upper_dev's
5224          * all_adj_list.upper visible to every dev's all_adj_list.lower an
5225          * versa, and don't forget the devices itself. All of these
5226          * links are non-neighbours.
5227          */
5228         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5229                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5230                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5231                                  i->dev->name, j->dev->name);
5232                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5233                         if (ret)
5234                                 goto rollback_mesh;
5235                 }
5236         }
5237
5238         /* add dev to every upper_dev's upper device */
5239         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5240                 pr_debug("linking %s's upper device %s with %s\n",
5241                          upper_dev->name, i->dev->name, dev->name);
5242                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5243                 if (ret)
5244                         goto rollback_upper_mesh;
5245         }
5246
5247         /* add upper_dev to every dev's lower device */
5248         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5249                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5250                          i->dev->name, upper_dev->name);
5251                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5252                 if (ret)
5253                         goto rollback_lower_mesh;
5254         }
5255
5256         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5257         return 0;
5258
5259 rollback_lower_mesh:
5260         to_i = i;
5261         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5262                 if (i == to_i)
5263                         break;
5264                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5265         }
5266
5267         i = NULL;
5268
5269 rollback_upper_mesh:
5270         to_i = i;
5271         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5272                 if (i == to_i)
5273                         break;
5274                 __netdev_adjacent_dev_unlink(dev, i->dev);
5275         }
5276
5277         i = j = NULL;
5278
5279 rollback_mesh:
5280         to_i = i;
5281         to_j = j;
5282         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5283                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5284                         if (i == to_i && j == to_j)
5285                                 break;
5286                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5287                 }
5288                 if (i == to_i)
5289                         break;
5290         }
5291
5292         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5293
5294         return ret;
5295 }
5296
5297 /**
5298  * netdev_upper_dev_link - Add a link to the upper device
5299  * @dev: device
5300  * @upper_dev: new upper device
5301  *
5302  * Adds a link to device which is upper to this one. The caller must hold
5303  * the RTNL lock. On a failure a negative errno code is returned.
5304  * On success the reference counts are adjusted and the function
5305  * returns zero.
5306  */
5307 int netdev_upper_dev_link(struct net_device *dev,
5308                           struct net_device *upper_dev)
5309 {
5310         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5311 }
5312 EXPORT_SYMBOL(netdev_upper_dev_link);
5313
5314 /**
5315  * netdev_master_upper_dev_link - Add a master link to the upper device
5316  * @dev: device
5317  * @upper_dev: new upper device
5318  *
5319  * Adds a link to device which is upper to this one. In this case, only
5320  * one master upper device can be linked, although other non-master devices
5321  * might be linked as well. The caller must hold the RTNL lock.
5322  * On a failure a negative errno code is returned. On success the reference
5323  * counts are adjusted and the function returns zero.
5324  */
5325 int netdev_master_upper_dev_link(struct net_device *dev,
5326                                  struct net_device *upper_dev)
5327 {
5328         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5329 }
5330 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5331
5332 int netdev_master_upper_dev_link_private(struct net_device *dev,
5333                                          struct net_device *upper_dev,
5334                                          void *private)
5335 {
5336         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5337 }
5338 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5339
5340 /**
5341  * netdev_upper_dev_unlink - Removes a link to upper device
5342  * @dev: device
5343  * @upper_dev: new upper device
5344  *
5345  * Removes a link to device which is upper to this one. The caller must hold
5346  * the RTNL lock.
5347  */
5348 void netdev_upper_dev_unlink(struct net_device *dev,
5349                              struct net_device *upper_dev)
5350 {
5351         struct netdev_adjacent *i, *j;
5352         ASSERT_RTNL();
5353
5354         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5355
5356         /* Here is the tricky part. We must remove all dev's lower
5357          * devices from all upper_dev's upper devices and vice
5358          * versa, to maintain the graph relationship.
5359          */
5360         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5361                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5362                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5363
5364         /* remove also the devices itself from lower/upper device
5365          * list
5366          */
5367         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5368                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5369
5370         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5371                 __netdev_adjacent_dev_unlink(dev, i->dev);
5372
5373         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5374 }
5375 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5376
5377 /**
5378  * netdev_bonding_info_change - Dispatch event about slave change
5379  * @dev: device
5380  * @bonding_info: info to dispatch
5381  *
5382  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5383  * The caller must hold the RTNL lock.
5384  */
5385 void netdev_bonding_info_change(struct net_device *dev,
5386                                 struct netdev_bonding_info *bonding_info)
5387 {
5388         struct netdev_notifier_bonding_info     info;
5389
5390         memcpy(&info.bonding_info, bonding_info,
5391                sizeof(struct netdev_bonding_info));
5392         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5393                                       &info.info);
5394 }
5395 EXPORT_SYMBOL(netdev_bonding_info_change);
5396
5397 static void netdev_adjacent_add_links(struct net_device *dev)
5398 {
5399         struct netdev_adjacent *iter;
5400
5401         struct net *net = dev_net(dev);
5402
5403         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5404                 if (!net_eq(net,dev_net(iter->dev)))
5405                         continue;
5406                 netdev_adjacent_sysfs_add(iter->dev, dev,
5407                                           &iter->dev->adj_list.lower);
5408                 netdev_adjacent_sysfs_add(dev, iter->dev,
5409                                           &dev->adj_list.upper);
5410         }
5411
5412         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5413                 if (!net_eq(net,dev_net(iter->dev)))
5414                         continue;
5415                 netdev_adjacent_sysfs_add(iter->dev, dev,
5416                                           &iter->dev->adj_list.upper);
5417                 netdev_adjacent_sysfs_add(dev, iter->dev,
5418                                           &dev->adj_list.lower);
5419         }
5420 }
5421
5422 static void netdev_adjacent_del_links(struct net_device *dev)
5423 {
5424         struct netdev_adjacent *iter;
5425
5426         struct net *net = dev_net(dev);
5427
5428         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5429                 if (!net_eq(net,dev_net(iter->dev)))
5430                         continue;
5431                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5432                                           &iter->dev->adj_list.lower);
5433                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5434                                           &dev->adj_list.upper);
5435         }
5436
5437         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5438                 if (!net_eq(net,dev_net(iter->dev)))
5439                         continue;
5440                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5441                                           &iter->dev->adj_list.upper);
5442                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5443                                           &dev->adj_list.lower);
5444         }
5445 }
5446
5447 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5448 {
5449         struct netdev_adjacent *iter;
5450
5451         struct net *net = dev_net(dev);
5452
5453         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5454                 if (!net_eq(net,dev_net(iter->dev)))
5455                         continue;
5456                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5457                                           &iter->dev->adj_list.lower);
5458                 netdev_adjacent_sysfs_add(iter->dev, dev,
5459                                           &iter->dev->adj_list.lower);
5460         }
5461
5462         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5463                 if (!net_eq(net,dev_net(iter->dev)))
5464                         continue;
5465                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5466                                           &iter->dev->adj_list.upper);
5467                 netdev_adjacent_sysfs_add(iter->dev, dev,
5468                                           &iter->dev->adj_list.upper);
5469         }
5470 }
5471
5472 void *netdev_lower_dev_get_private(struct net_device *dev,
5473                                    struct net_device *lower_dev)
5474 {
5475         struct netdev_adjacent *lower;
5476
5477         if (!lower_dev)
5478                 return NULL;
5479         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5480         if (!lower)
5481                 return NULL;
5482
5483         return lower->private;
5484 }
5485 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5486
5487
5488 int dev_get_nest_level(struct net_device *dev,
5489                        bool (*type_check)(struct net_device *dev))
5490 {
5491         struct net_device *lower = NULL;
5492         struct list_head *iter;
5493         int max_nest = -1;
5494         int nest;
5495
5496         ASSERT_RTNL();
5497
5498         netdev_for_each_lower_dev(dev, lower, iter) {
5499                 nest = dev_get_nest_level(lower, type_check);
5500                 if (max_nest < nest)
5501                         max_nest = nest;
5502         }
5503
5504         if (type_check(dev))
5505                 max_nest++;
5506
5507         return max_nest;
5508 }
5509 EXPORT_SYMBOL(dev_get_nest_level);
5510
5511 static void dev_change_rx_flags(struct net_device *dev, int flags)
5512 {
5513         const struct net_device_ops *ops = dev->netdev_ops;
5514
5515         if (ops->ndo_change_rx_flags)
5516                 ops->ndo_change_rx_flags(dev, flags);
5517 }
5518
5519 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5520 {
5521         unsigned int old_flags = dev->flags;
5522         kuid_t uid;
5523         kgid_t gid;
5524
5525         ASSERT_RTNL();
5526
5527         dev->flags |= IFF_PROMISC;
5528         dev->promiscuity += inc;
5529         if (dev->promiscuity == 0) {
5530                 /*
5531                  * Avoid overflow.
5532                  * If inc causes overflow, untouch promisc and return error.
5533                  */
5534                 if (inc < 0)
5535                         dev->flags &= ~IFF_PROMISC;
5536                 else {
5537                         dev->promiscuity -= inc;
5538                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5539                                 dev->name);
5540                         return -EOVERFLOW;
5541                 }
5542         }
5543         if (dev->flags != old_flags) {
5544                 pr_info("device %s %s promiscuous mode\n",
5545                         dev->name,
5546                         dev->flags & IFF_PROMISC ? "entered" : "left");
5547                 if (audit_enabled) {
5548                         current_uid_gid(&uid, &gid);
5549                         audit_log(current->audit_context, GFP_ATOMIC,
5550                                 AUDIT_ANOM_PROMISCUOUS,
5551                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5552                                 dev->name, (dev->flags & IFF_PROMISC),
5553                                 (old_flags & IFF_PROMISC),
5554                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5555                                 from_kuid(&init_user_ns, uid),
5556                                 from_kgid(&init_user_ns, gid),
5557                                 audit_get_sessionid(current));
5558                 }
5559
5560                 dev_change_rx_flags(dev, IFF_PROMISC);
5561         }
5562         if (notify)
5563                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5564         return 0;
5565 }
5566
5567 /**
5568  *      dev_set_promiscuity     - update promiscuity count on a device
5569  *      @dev: device
5570  *      @inc: modifier
5571  *
5572  *      Add or remove promiscuity from a device. While the count in the device
5573  *      remains above zero the interface remains promiscuous. Once it hits zero
5574  *      the device reverts back to normal filtering operation. A negative inc
5575  *      value is used to drop promiscuity on the device.
5576  *      Return 0 if successful or a negative errno code on error.
5577  */
5578 int dev_set_promiscuity(struct net_device *dev, int inc)
5579 {
5580         unsigned int old_flags = dev->flags;
5581         int err;
5582
5583         err = __dev_set_promiscuity(dev, inc, true);
5584         if (err < 0)
5585                 return err;
5586         if (dev->flags != old_flags)
5587                 dev_set_rx_mode(dev);
5588         return err;
5589 }
5590 EXPORT_SYMBOL(dev_set_promiscuity);
5591
5592 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5593 {
5594         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5595
5596         ASSERT_RTNL();
5597
5598         dev->flags |= IFF_ALLMULTI;
5599         dev->allmulti += inc;
5600         if (dev->allmulti == 0) {
5601                 /*
5602                  * Avoid overflow.
5603                  * If inc causes overflow, untouch allmulti and return error.
5604                  */
5605                 if (inc < 0)
5606                         dev->flags &= ~IFF_ALLMULTI;
5607                 else {
5608                         dev->allmulti -= inc;
5609                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5610                                 dev->name);
5611                         return -EOVERFLOW;
5612                 }
5613         }
5614         if (dev->flags ^ old_flags) {
5615                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5616                 dev_set_rx_mode(dev);
5617                 if (notify)
5618                         __dev_notify_flags(dev, old_flags,
5619                                            dev->gflags ^ old_gflags);
5620         }
5621         return 0;
5622 }
5623
5624 /**
5625  *      dev_set_allmulti        - update allmulti count on a device
5626  *      @dev: device
5627  *      @inc: modifier
5628  *
5629  *      Add or remove reception of all multicast frames to a device. While the
5630  *      count in the device remains above zero the interface remains listening
5631  *      to all interfaces. Once it hits zero the device reverts back to normal
5632  *      filtering operation. A negative @inc value is used to drop the counter
5633  *      when releasing a resource needing all multicasts.
5634  *      Return 0 if successful or a negative errno code on error.
5635  */
5636
5637 int dev_set_allmulti(struct net_device *dev, int inc)
5638 {
5639         return __dev_set_allmulti(dev, inc, true);
5640 }
5641 EXPORT_SYMBOL(dev_set_allmulti);
5642
5643 /*
5644  *      Upload unicast and multicast address lists to device and
5645  *      configure RX filtering. When the device doesn't support unicast
5646  *      filtering it is put in promiscuous mode while unicast addresses
5647  *      are present.
5648  */
5649 void __dev_set_rx_mode(struct net_device *dev)
5650 {
5651         const struct net_device_ops *ops = dev->netdev_ops;
5652
5653         /* dev_open will call this function so the list will stay sane. */
5654         if (!(dev->flags&IFF_UP))
5655                 return;
5656
5657         if (!netif_device_present(dev))
5658                 return;
5659
5660         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5661                 /* Unicast addresses changes may only happen under the rtnl,
5662                  * therefore calling __dev_set_promiscuity here is safe.
5663                  */
5664                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5665                         __dev_set_promiscuity(dev, 1, false);
5666                         dev->uc_promisc = true;
5667                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5668                         __dev_set_promiscuity(dev, -1, false);
5669                         dev->uc_promisc = false;
5670                 }
5671         }
5672
5673         if (ops->ndo_set_rx_mode)
5674                 ops->ndo_set_rx_mode(dev);
5675 }
5676
/* Locked wrapper: runs __dev_set_rx_mode(@dev) between
 * netif_addr_lock_bh() and netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);

        __dev_set_rx_mode(dev);

        netif_addr_unlock_bh(dev);
}
5683
5684 /**
5685  *      dev_get_flags - get flags reported to userspace
5686  *      @dev: device
5687  *
5688  *      Get the combination of flag bits exported through APIs to userspace.
5689  */
5690 unsigned int dev_get_flags(const struct net_device *dev)
5691 {
5692         unsigned int flags;
5693
5694         flags = (dev->flags & ~(IFF_PROMISC |
5695                                 IFF_ALLMULTI |
5696                                 IFF_RUNNING |
5697                                 IFF_LOWER_UP |
5698                                 IFF_DORMANT)) |
5699                 (dev->gflags & (IFF_PROMISC |
5700                                 IFF_ALLMULTI));
5701
5702         if (netif_running(dev)) {
5703                 if (netif_oper_up(dev))
5704                         flags |= IFF_RUNNING;
5705                 if (netif_carrier_ok(dev))
5706                         flags |= IFF_LOWER_UP;
5707                 if (netif_dormant(dev))
5708                         flags |= IFF_DORMANT;
5709         }
5710
5711         return flags;
5712 }
5713 EXPORT_SYMBOL(dev_get_flags);
5714
5715 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5716 {
5717         unsigned int old_flags = dev->flags;
5718         int ret;
5719
5720         ASSERT_RTNL();
5721
5722         /*
5723          *      Set the flags on our device.
5724          */
5725
5726         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5727                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5728                                IFF_AUTOMEDIA)) |
5729                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5730                                     IFF_ALLMULTI));
5731
5732         /*
5733          *      Load in the correct multicast list now the flags have changed.
5734          */
5735
5736         if ((old_flags ^ flags) & IFF_MULTICAST)
5737                 dev_change_rx_flags(dev, IFF_MULTICAST);
5738
5739         dev_set_rx_mode(dev);
5740
5741         /*
5742          *      Have we downed the interface. We handle IFF_UP ourselves
5743          *      according to user attempts to set it, rather than blindly
5744          *      setting it.
5745          */
5746
5747         ret = 0;
5748         if ((old_flags ^ flags) & IFF_UP)
5749                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5750
5751         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5752                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5753                 unsigned int old_flags = dev->flags;
5754
5755                 dev->gflags ^= IFF_PROMISC;
5756
5757                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5758                         if (dev->flags != old_flags)
5759                                 dev_set_rx_mode(dev);
5760         }
5761
5762         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5763            is important. Some (broken) drivers set IFF_PROMISC, when
5764            IFF_ALLMULTI is requested not asking us and not reporting.
5765          */
5766         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5767                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5768
5769                 dev->gflags ^= IFF_ALLMULTI;
5770                 __dev_set_allmulti(dev, inc, false);
5771         }
5772
5773         return ret;
5774 }
5775
5776 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5777                         unsigned int gchanges)
5778 {
5779         unsigned int changes = dev->flags ^ old_flags;
5780
5781         if (gchanges)
5782                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5783
5784         if (changes & IFF_UP) {
5785                 if (dev->flags & IFF_UP)
5786                         call_netdevice_notifiers(NETDEV_UP, dev);
5787                 else
5788                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5789         }
5790
5791         if (dev->flags & IFF_UP &&
5792             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5793                 struct netdev_notifier_change_info change_info;
5794
5795                 change_info.flags_changed = changes;
5796                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5797                                               &change_info.info);
5798         }
5799 }
5800
5801 /**
5802  *      dev_change_flags - change device settings
5803  *      @dev: device
5804  *      @flags: device state flags
5805  *
5806  *      Change settings on device based state flags. The flags are
5807  *      in the userspace exported format.
5808  */
5809 int dev_change_flags(struct net_device *dev, unsigned int flags)
5810 {
5811         int ret;
5812         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5813
5814         ret = __dev_change_flags(dev, flags);
5815         if (ret < 0)
5816                 return ret;
5817
5818         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5819         __dev_notify_flags(dev, old_flags, changes);
5820         return ret;
5821 }
5822 EXPORT_SYMBOL(dev_change_flags);
5823
5824 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5825 {
5826         const struct net_device_ops *ops = dev->netdev_ops;
5827
5828         if (ops->ndo_change_mtu)
5829                 return ops->ndo_change_mtu(dev, new_mtu);
5830
5831         dev->mtu = new_mtu;
5832         return 0;
5833 }
5834
5835 /**
5836  *      dev_set_mtu - Change maximum transfer unit
5837  *      @dev: device
5838  *      @new_mtu: new transfer unit
5839  *
5840  *      Change the maximum transfer size of the network device.
5841  */
5842 int dev_set_mtu(struct net_device *dev, int new_mtu)
5843 {
5844         int err, orig_mtu;
5845
5846         if (new_mtu == dev->mtu)
5847                 return 0;
5848
5849         /*      MTU must be positive.    */
5850         if (new_mtu < 0)
5851                 return -EINVAL;
5852
5853         if (!netif_device_present(dev))
5854                 return -ENODEV;
5855
5856         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5857         err = notifier_to_errno(err);
5858         if (err)
5859                 return err;
5860
5861         orig_mtu = dev->mtu;
5862         err = __dev_set_mtu(dev, new_mtu);
5863
5864         if (!err) {
5865                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5866                 err = notifier_to_errno(err);
5867                 if (err) {
5868                         /* setting mtu back and notifying everyone again,
5869                          * so that they have a chance to revert changes.
5870                          */
5871                         __dev_set_mtu(dev, orig_mtu);
5872                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5873                 }
5874         }
5875         return err;
5876 }
5877 EXPORT_SYMBOL(dev_set_mtu);
5878
5879 /**
5880  *      dev_set_group - Change group this device belongs to
5881  *      @dev: device
5882  *      @new_group: group this device should belong to
5883  */
5884 void dev_set_group(struct net_device *dev, int new_group)
5885 {
5886         dev->group = new_group;
5887 }
5888 EXPORT_SYMBOL(dev_set_group);
5889
5890 /**
5891  *      dev_set_mac_address - Change Media Access Control Address
5892  *      @dev: device
5893  *      @sa: new address
5894  *
5895  *      Change the hardware (MAC) address of the device
5896  */
5897 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5898 {
5899         const struct net_device_ops *ops = dev->netdev_ops;
5900         int err;
5901
5902         if (!ops->ndo_set_mac_address)
5903                 return -EOPNOTSUPP;
5904         if (sa->sa_family != dev->type)
5905                 return -EINVAL;
5906         if (!netif_device_present(dev))
5907                 return -ENODEV;
5908         err = ops->ndo_set_mac_address(dev, sa);
5909         if (err)
5910                 return err;
5911         dev->addr_assign_type = NET_ADDR_SET;
5912         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5913         add_device_randomness(dev->dev_addr, dev->addr_len);
5914         return 0;
5915 }
5916 EXPORT_SYMBOL(dev_set_mac_address);
5917
5918 /**
5919  *      dev_change_carrier - Change device carrier
5920  *      @dev: device
5921  *      @new_carrier: new value
5922  *
5923  *      Change device carrier
5924  */
5925 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5926 {
5927         const struct net_device_ops *ops = dev->netdev_ops;
5928
5929         if (!ops->ndo_change_carrier)
5930                 return -EOPNOTSUPP;
5931         if (!netif_device_present(dev))
5932                 return -ENODEV;
5933         return ops->ndo_change_carrier(dev, new_carrier);
5934 }
5935 EXPORT_SYMBOL(dev_change_carrier);
5936
5937 /**
5938  *      dev_get_phys_port_id - Get device physical port ID
5939  *      @dev: device
5940  *      @ppid: port ID
5941  *
5942  *      Get device physical port ID
5943  */
5944 int dev_get_phys_port_id(struct net_device *dev,
5945                          struct netdev_phys_item_id *ppid)
5946 {
5947         const struct net_device_ops *ops = dev->netdev_ops;
5948
5949         if (!ops->ndo_get_phys_port_id)
5950                 return -EOPNOTSUPP;
5951         return ops->ndo_get_phys_port_id(dev, ppid);
5952 }
5953 EXPORT_SYMBOL(dev_get_phys_port_id);
5954
5955 /**
5956  *      dev_get_phys_port_name - Get device physical port name
5957  *      @dev: device
5958  *      @name: port name
5959  *
5960  *      Get device physical port name
5961  */
5962 int dev_get_phys_port_name(struct net_device *dev,
5963                            char *name, size_t len)
5964 {
5965         const struct net_device_ops *ops = dev->netdev_ops;
5966
5967         if (!ops->ndo_get_phys_port_name)
5968                 return -EOPNOTSUPP;
5969         return ops->ndo_get_phys_port_name(dev, name, len);
5970 }
5971 EXPORT_SYMBOL(dev_get_phys_port_name);
5972
5973 /**
5974  *      dev_new_index   -       allocate an ifindex
5975  *      @net: the applicable net namespace
5976  *
5977  *      Returns a suitable unique value for a new device interface
5978  *      number.  The caller must hold the rtnl semaphore or the
5979  *      dev_base_lock to be sure it remains unique.
5980  */
5981 static int dev_new_index(struct net *net)
5982 {
5983         int ifindex = net->ifindex;
5984         for (;;) {
5985                 if (++ifindex <= 0)
5986                         ifindex = 1;
5987                 if (!__dev_get_by_index(net, ifindex))
5988                         return net->ifindex = ifindex;
5989         }
5990 }
5991
5992 /* Delayed registration/unregisteration */
5993 static LIST_HEAD(net_todo_list);
5994 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5995
5996 static void net_set_todo(struct net_device *dev)
5997 {
5998         list_add_tail(&dev->todo_list, &net_todo_list);
5999         dev_net(dev)->dev_unreg_count++;
6000 }
6001
6002 static void rollback_registered_many(struct list_head *head)
6003 {
6004         struct net_device *dev, *tmp;
6005         LIST_HEAD(close_head);
6006
6007         BUG_ON(dev_boot_phase);
6008         ASSERT_RTNL();
6009
6010         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6011                 /* Some devices call without registering
6012                  * for initialization unwind. Remove those
6013                  * devices and proceed with the remaining.
6014                  */
6015                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6016                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6017                                  dev->name, dev);
6018
6019                         WARN_ON(1);
6020                         list_del(&dev->unreg_list);
6021                         continue;
6022                 }
6023                 dev->dismantle = true;
6024                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6025         }
6026
6027         /* If device is running, close it first. */
6028         list_for_each_entry(dev, head, unreg_list)
6029                 list_add_tail(&dev->close_list, &close_head);
6030         dev_close_many(&close_head, true);
6031
6032         list_for_each_entry(dev, head, unreg_list) {
6033                 /* And unlink it from device chain. */
6034                 unlist_netdevice(dev);
6035
6036                 dev->reg_state = NETREG_UNREGISTERING;
6037         }
6038
6039         synchronize_net();
6040
6041         list_for_each_entry(dev, head, unreg_list) {
6042                 struct sk_buff *skb = NULL;
6043
6044                 /* Shutdown queueing discipline. */
6045                 dev_shutdown(dev);
6046
6047
6048                 /* Notify protocols, that we are about to destroy
6049                    this device. They should clean all the things.
6050                 */
6051                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6052
6053                 if (!dev->rtnl_link_ops ||
6054                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6055                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6056                                                      GFP_KERNEL);
6057
6058                 /*
6059                  *      Flush the unicast and multicast chains
6060                  */
6061                 dev_uc_flush(dev);
6062                 dev_mc_flush(dev);
6063
6064                 if (dev->netdev_ops->ndo_uninit)
6065                         dev->netdev_ops->ndo_uninit(dev);
6066
6067                 if (skb)
6068                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6069
6070                 /* Notifier chain MUST detach us all upper devices. */
6071                 WARN_ON(netdev_has_any_upper_dev(dev));
6072
6073                 /* Remove entries from kobject tree */
6074                 netdev_unregister_kobject(dev);
6075 #ifdef CONFIG_XPS
6076                 /* Remove XPS queueing entries */
6077                 netif_reset_xps_queues_gt(dev, 0);
6078 #endif
6079         }
6080
6081         synchronize_net();
6082
6083         list_for_each_entry(dev, head, unreg_list)
6084                 dev_put(dev);
6085 }
6086
6087 static void rollback_registered(struct net_device *dev)
6088 {
6089         LIST_HEAD(single);
6090
6091         list_add(&dev->unreg_list, &single);
6092         rollback_registered_many(&single);
6093         list_del(&single);
6094 }
6095
6096 static netdev_features_t netdev_fix_features(struct net_device *dev,
6097         netdev_features_t features)
6098 {
6099         /* Fix illegal checksum combinations */
6100         if ((features & NETIF_F_HW_CSUM) &&
6101             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6102                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6103                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6104         }
6105
6106         /* TSO requires that SG is present as well. */
6107         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6108                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6109                 features &= ~NETIF_F_ALL_TSO;
6110         }
6111
6112         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6113                                         !(features & NETIF_F_IP_CSUM)) {
6114                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6115                 features &= ~NETIF_F_TSO;
6116                 features &= ~NETIF_F_TSO_ECN;
6117         }
6118
6119         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6120                                          !(features & NETIF_F_IPV6_CSUM)) {
6121                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6122                 features &= ~NETIF_F_TSO6;
6123         }
6124
6125         /* TSO ECN requires that TSO is present as well. */
6126         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6127                 features &= ~NETIF_F_TSO_ECN;
6128
6129         /* Software GSO depends on SG. */
6130         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6131                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6132                 features &= ~NETIF_F_GSO;
6133         }
6134
6135         /* UFO needs SG and checksumming */
6136         if (features & NETIF_F_UFO) {
6137                 /* maybe split UFO into V4 and V6? */
6138                 if (!((features & NETIF_F_GEN_CSUM) ||
6139                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6140                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6141                         netdev_dbg(dev,
6142                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6143                         features &= ~NETIF_F_UFO;
6144                 }
6145
6146                 if (!(features & NETIF_F_SG)) {
6147                         netdev_dbg(dev,
6148                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6149                         features &= ~NETIF_F_UFO;
6150                 }
6151         }
6152
6153 #ifdef CONFIG_NET_RX_BUSY_POLL
6154         if (dev->netdev_ops->ndo_busy_poll)
6155                 features |= NETIF_F_BUSY_POLL;
6156         else
6157 #endif
6158                 features &= ~NETIF_F_BUSY_POLL;
6159
6160         return features;
6161 }
6162
6163 int __netdev_update_features(struct net_device *dev)
6164 {
6165         netdev_features_t features;
6166         int err = 0;
6167
6168         ASSERT_RTNL();
6169
6170         features = netdev_get_wanted_features(dev);
6171
6172         if (dev->netdev_ops->ndo_fix_features)
6173                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6174
6175         /* driver might be less strict about feature dependencies */
6176         features = netdev_fix_features(dev, features);
6177
6178         if (dev->features == features)
6179                 return 0;
6180
6181         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6182                 &dev->features, &features);
6183
6184         if (dev->netdev_ops->ndo_set_features)
6185                 err = dev->netdev_ops->ndo_set_features(dev, features);
6186
6187         if (unlikely(err < 0)) {
6188                 netdev_err(dev,
6189                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6190                         err, &features, &dev->features);
6191                 return -1;
6192         }
6193
6194         if (!err)
6195                 dev->features = features;
6196
6197         return 1;
6198 }
6199
/**
 *      netdev_update_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculate the dev->features set and send a notification unless
 *      the recalculation reported that nothing changed.  Should be called
 *      after driver or hardware dependent conditions that influence the
 *      features might have changed.
 *
 *      Any non-zero result of __netdev_update_features() triggers the
 *      notification, including its failure value.
 */
void netdev_update_features(struct net_device *dev)
{
        if (__netdev_update_features(dev) == 0)
                return;

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
6214
/**
 *      netdev_change_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculate the dev->features set and send a notification
 *      unconditionally, i.e. even when the set did not change.  Use this
 *      instead of netdev_update_features() when dev->vlan_features might
 *      have changed as well, so that the change can be propagated to
 *      stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
        /* The outcome is deliberately ignored: we always notify. */
        (void)__netdev_update_features(dev);
        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
6231
6232 /**
6233  *      netif_stacked_transfer_operstate -      transfer operstate
6234  *      @rootdev: the root or lower level device to transfer state from
6235  *      @dev: the device to transfer operstate to
6236  *
6237  *      Transfer operational state from root to device. This is normally
6238  *      called when a stacking relationship exists between the root
6239  *      device and the device(a leaf device).
6240  */
6241 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6242                                         struct net_device *dev)
6243 {
6244         if (rootdev->operstate == IF_OPER_DORMANT)
6245                 netif_dormant_on(dev);
6246         else
6247                 netif_dormant_off(dev);
6248
6249         if (netif_carrier_ok(rootdev)) {
6250                 if (!netif_carrier_ok(dev))
6251                         netif_carrier_on(dev);
6252         } else {
6253                 if (netif_carrier_ok(dev))
6254                         netif_carrier_off(dev);
6255         }
6256 }
6257 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6258
#ifdef CONFIG_SYSFS
/*
 * Allocate the zeroed RX queue array for @dev (dev->num_rx_queues
 * entries, which must be at least one) and point every entry back at
 * its device.  kzalloc() is tried first, with vzalloc() as fallback.
 *
 * Returns 0 on success or -ENOMEM if both allocators fail.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int nqueues = dev->num_rx_queues;
        struct netdev_rx_queue *queues;
        size_t bytes = nqueues * sizeof(*queues);
        unsigned int idx;

        BUG_ON(nqueues < 1);

        queues = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!queues)
                queues = vzalloc(bytes);
        if (!queues)
                return -ENOMEM;

        dev->_rx = queues;

        for (idx = 0; idx < nqueues; idx++)
                queues[idx].dev = dev;

        return 0;
}
#endif
6281
6282 static void netdev_init_one_queue(struct net_device *dev,
6283                                   struct netdev_queue *queue, void *_unused)
6284 {
6285         /* Initialize queue lock */
6286         spin_lock_init(&queue->_xmit_lock);
6287         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6288         queue->xmit_lock_owner = -1;
6289         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6290         queue->dev = dev;
6291 #ifdef CONFIG_BQL
6292         dql_init(&queue->dql, HZ);
6293 #endif
6294 }
6295
6296 static void netif_free_tx_queues(struct net_device *dev)
6297 {
6298         kvfree(dev->_tx);
6299 }
6300
6301 static int netif_alloc_netdev_queues(struct net_device *dev)
6302 {
6303         unsigned int count = dev->num_tx_queues;
6304         struct netdev_queue *tx;
6305         size_t sz = count * sizeof(*tx);
6306
6307         BUG_ON(count < 1 || count > 0xffff);
6308
6309         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6310         if (!tx) {
6311                 tx = vzalloc(sz);
6312                 if (!tx)
6313                         return -ENOMEM;
6314         }
6315         dev->_tx = tx;
6316
6317         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6318         spin_lock_init(&dev->tx_global_lock);
6319
6320         return 0;
6321 }
6322
6323 /**
6324  *      register_netdevice      - register a network device
6325  *      @dev: device to register
6326  *
6327  *      Take a completed network device structure and add it to the kernel
6328  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6329  *      chain. 0 is returned on success. A negative errno code is returned
6330  *      on a failure to set up the device, or if the name is a duplicate.
6331  *
6332  *      Callers must hold the rtnl semaphore. You may want
6333  *      register_netdev() instead of this.
6334  *
6335  *      BUGS:
6336  *      The locking appears insufficient to guarantee two parallel registers
6337  *      will not get the same name.
6338  */
6339
6340 int register_netdevice(struct net_device *dev)
6341 {
6342         int ret;
6343         struct net *net = dev_net(dev);
6344
6345         BUG_ON(dev_boot_phase);
6346         ASSERT_RTNL();
6347
6348         might_sleep();
6349
6350         /* When net_device's are persistent, this will be fatal. */
6351         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6352         BUG_ON(!net);
6353
6354         spin_lock_init(&dev->addr_list_lock);
6355         netdev_set_addr_lockdep_class(dev);
6356
6357         ret = dev_get_valid_name(net, dev, dev->name);
6358         if (ret < 0)
6359                 goto out;
6360
6361         /* Init, if this function is available */
6362         if (dev->netdev_ops->ndo_init) {
6363                 ret = dev->netdev_ops->ndo_init(dev);
6364                 if (ret) {
6365                         if (ret > 0)
6366                                 ret = -EIO;
6367                         goto out;
6368                 }
6369         }
6370
6371         if (((dev->hw_features | dev->features) &
6372              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6373             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6374              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6375                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6376                 ret = -EINVAL;
6377                 goto err_uninit;
6378         }
6379
6380         ret = -EBUSY;
6381         if (!dev->ifindex)
6382                 dev->ifindex = dev_new_index(net);
6383         else if (__dev_get_by_index(net, dev->ifindex))
6384                 goto err_uninit;
6385
6386         /* Transfer changeable features to wanted_features and enable
6387          * software offloads (GSO and GRO).
6388          */
6389         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6390         dev->features |= NETIF_F_SOFT_FEATURES;
6391         dev->wanted_features = dev->features & dev->hw_features;
6392
6393         if (!(dev->flags & IFF_LOOPBACK)) {
6394                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6395         }
6396
6397         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6398          */
6399         dev->vlan_features |= NETIF_F_HIGHDMA;
6400
6401         /* Make NETIF_F_SG inheritable to tunnel devices.
6402          */
6403         dev->hw_enc_features |= NETIF_F_SG;
6404
6405         /* Make NETIF_F_SG inheritable to MPLS.
6406          */
6407         dev->mpls_features |= NETIF_F_SG;
6408
6409         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6410         ret = notifier_to_errno(ret);
6411         if (ret)
6412                 goto err_uninit;
6413
6414         ret = netdev_register_kobject(dev);
6415         if (ret)
6416                 goto err_uninit;
6417         dev->reg_state = NETREG_REGISTERED;
6418
6419         __netdev_update_features(dev);
6420
6421         /*
6422          *      Default initial state at registry is that the
6423          *      device is present.
6424          */
6425
6426         set_bit(__LINK_STATE_PRESENT, &dev->state);
6427
6428         linkwatch_init_dev(dev);
6429
6430         dev_init_scheduler(dev);
6431         dev_hold(dev);
6432         list_netdevice(dev);
6433         add_device_randomness(dev->dev_addr, dev->addr_len);
6434
6435         /* If the device has permanent device address, driver should
6436          * set dev_addr and also addr_assign_type should be set to
6437          * NET_ADDR_PERM (default value).
6438          */
6439         if (dev->addr_assign_type == NET_ADDR_PERM)
6440                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6441
6442         /* Notify protocols, that a new device appeared. */
6443         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6444         ret = notifier_to_errno(ret);
6445         if (ret) {
6446                 rollback_registered(dev);
6447                 dev->reg_state = NETREG_UNREGISTERED;
6448         }
6449         /*
6450          *      Prevent userspace races by waiting until the network
6451          *      device is fully setup before sending notifications.
6452          */
6453         if (!dev->rtnl_link_ops ||
6454             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6455                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6456
6457 out:
6458         return ret;
6459
6460 err_uninit:
6461         if (dev->netdev_ops->ndo_uninit)
6462                 dev->netdev_ops->ndo_uninit(dev);
6463         goto out;
6464 }
6465 EXPORT_SYMBOL(register_netdevice);
6466
6467 /**
6468  *      init_dummy_netdev       - init a dummy network device for NAPI
6469  *      @dev: device to init
6470  *
6471  *      This takes a network device structure and initialize the minimum
6472  *      amount of fields so it can be used to schedule NAPI polls without
6473  *      registering a full blown interface. This is to be used by drivers
6474  *      that need to tie several hardware interfaces to a single NAPI
6475  *      poll scheduler due to HW limitations.
6476  */
6477 int init_dummy_netdev(struct net_device *dev)
6478 {
6479         /* Clear everything. Note we don't initialize spinlocks
6480          * are they aren't supposed to be taken by any of the
6481          * NAPI code and this dummy netdev is supposed to be
6482          * only ever used for NAPI polls
6483          */
6484         memset(dev, 0, sizeof(struct net_device));
6485
6486         /* make sure we BUG if trying to hit standard
6487          * register/unregister code path
6488          */
6489         dev->reg_state = NETREG_DUMMY;
6490
6491         /* NAPI wants this */
6492         INIT_LIST_HEAD(&dev->napi_list);
6493
6494         /* a dummy interface is started by default */
6495         set_bit(__LINK_STATE_PRESENT, &dev->state);
6496         set_bit(__LINK_STATE_START, &dev->state);
6497
6498         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6499          * because users of this 'device' dont need to change
6500          * its refcount.
6501          */
6502
6503         return 0;
6504 }
6505 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6506
6507
/**
 *      register_netdev - register a network device
 *      @dev: device to register
 *
 *      Take a completed network device structure and add it to the kernel
 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *      chain. 0 is returned on success. A negative errno code is returned
 *      on a failure to set up the device, or if the name is a duplicate.
 *
 *      This is a wrapper around register_netdevice() that takes the rtnl
 *      semaphore for the duration of the call; register_netdevice() also
 *      expands the device name if you passed a format string to
 *      alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
        int rc;

        rtnl_lock();
        rc = register_netdevice(dev);
        rtnl_unlock();

        return rc;
}
EXPORT_SYMBOL(register_netdev);
6531
6532 int netdev_refcnt_read(const struct net_device *dev)
6533 {
6534         int i, refcnt = 0;
6535
6536         for_each_possible_cpu(i)
6537                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6538         return refcnt;
6539 }
6540 EXPORT_SYMBOL(netdev_refcnt_read);
6541
6542 /**
6543  * netdev_wait_allrefs - wait until all references are gone.
6544  * @dev: target net_device
6545  *
6546  * This is called when unregistering network devices.
6547  *
6548  * Any protocol or device that holds a reference should register
6549  * for netdevice notification, and cleanup and put back the
6550  * reference if they receive an UNREGISTER event.
6551  * We can get stuck here if buggy protocols don't correctly
6552  * call dev_put.
6553  */
6554 static void netdev_wait_allrefs(struct net_device *dev)
6555 {
6556         unsigned long rebroadcast_time, warning_time;
6557         int refcnt;
6558
6559         linkwatch_forget_dev(dev);
6560
6561         rebroadcast_time = warning_time = jiffies;
6562         refcnt = netdev_refcnt_read(dev);
6563
6564         while (refcnt != 0) {
6565                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6566                         rtnl_lock();
6567
6568                         /* Rebroadcast unregister notification */
6569                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6570
6571                         __rtnl_unlock();
6572                         rcu_barrier();
6573                         rtnl_lock();
6574
6575                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6576                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6577                                      &dev->state)) {
6578                                 /* We must not have linkwatch events
6579                                  * pending on unregister. If this
6580                                  * happens, we simply run the queue
6581                                  * unscheduled, resulting in a noop
6582                                  * for this device.
6583                                  */
6584                                 linkwatch_run_queue();
6585                         }
6586
6587                         __rtnl_unlock();
6588
6589                         rebroadcast_time = jiffies;
6590                 }
6591
6592                 msleep(250);
6593
6594                 refcnt = netdev_refcnt_read(dev);
6595
6596                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6597                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6598                                  dev->name, refcnt);
6599                         warning_time = jiffies;
6600                 }
6601         }
6602 }
6603
6604 /* The sequence is:
6605  *
6606  *      rtnl_lock();
6607  *      ...
6608  *      register_netdevice(x1);
6609  *      register_netdevice(x2);
6610  *      ...
6611  *      unregister_netdevice(y1);
6612  *      unregister_netdevice(y2);
6613  *      ...
6614  *      rtnl_unlock();
6615  *      free_netdev(y1);
6616  *      free_netdev(y2);
6617  *
6618  * We are invoked by rtnl_unlock().
6619  * This allows us to deal with problems:
6620  * 1) We can delete sysfs objects which invoke hotplug
6621  *    without deadlocking with linkwatch via keventd.
6622  * 2) Since we run with the RTNL semaphore not held, we can sleep
6623  *    safely in order to wait for the netdev refcnt to drop to zero.
6624  *
6625  * We must not return until all unregister events added during
6626  * the interval the lock was held have been completed.
6627  */
6628 void netdev_run_todo(void)
6629 {
6630         struct list_head list;
6631
6632         /* Snapshot list, allow later requests */
6633         list_replace_init(&net_todo_list, &list);
6634
6635         __rtnl_unlock();
6636
6637
6638         /* Wait for rcu callbacks to finish before next phase */
6639         if (!list_empty(&list))
6640                 rcu_barrier();
6641
6642         while (!list_empty(&list)) {
6643                 struct net_device *dev
6644                         = list_first_entry(&list, struct net_device, todo_list);
6645                 list_del(&dev->todo_list);
6646
6647                 rtnl_lock();
6648                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6649                 __rtnl_unlock();
6650
6651                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6652                         pr_err("network todo '%s' but state %d\n",
6653                                dev->name, dev->reg_state);
6654                         dump_stack();
6655                         continue;
6656                 }
6657
6658                 dev->reg_state = NETREG_UNREGISTERED;
6659
6660                 on_each_cpu(flush_backlog, dev, 1);
6661
6662                 netdev_wait_allrefs(dev);
6663
6664                 /* paranoia */
6665                 BUG_ON(netdev_refcnt_read(dev));
6666                 BUG_ON(!list_empty(&dev->ptype_all));
6667                 BUG_ON(!list_empty(&dev->ptype_specific));
6668                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6669                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6670                 WARN_ON(dev->dn_ptr);
6671
6672                 if (dev->destructor)
6673                         dev->destructor(dev);
6674
6675                 /* Report a network device has been unregistered */
6676                 rtnl_lock();
6677                 dev_net(dev)->dev_unreg_count--;
6678                 __rtnl_unlock();
6679                 wake_up(&netdev_unregistering_wq);
6680
6681                 /* Free network device */
6682                 kobject_put(&dev->dev.kobj);
6683         }
6684 }
6685
6686 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6687  * fields in the same order, with only the type differing.
6688  */
6689 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6690                              const struct net_device_stats *netdev_stats)
6691 {
6692 #if BITS_PER_LONG == 64
6693         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6694         memcpy(stats64, netdev_stats, sizeof(*stats64));
6695 #else
6696         size_t i, n = sizeof(*stats64) / sizeof(u64);
6697         const unsigned long *src = (const unsigned long *)netdev_stats;
6698         u64 *dst = (u64 *)stats64;
6699
6700         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6701                      sizeof(*stats64) / sizeof(u64));
6702         for (i = 0; i < n; i++)
6703                 dst[i] = src[i];
6704 #endif
6705 }
6706 EXPORT_SYMBOL(netdev_stats_to_stats64);
6707
6708 /**
6709  *      dev_get_stats   - get network device statistics
6710  *      @dev: device to get statistics from
6711  *      @storage: place to store stats
6712  *
6713  *      Get network statistics from device. Return @storage.
6714  *      The device driver may provide its own method by setting
6715  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6716  *      otherwise the internal statistics structure is used.
6717  */
6718 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6719                                         struct rtnl_link_stats64 *storage)
6720 {
6721         const struct net_device_ops *ops = dev->netdev_ops;
6722
6723         if (ops->ndo_get_stats64) {
6724                 memset(storage, 0, sizeof(*storage));
6725                 ops->ndo_get_stats64(dev, storage);
6726         } else if (ops->ndo_get_stats) {
6727                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6728         } else {
6729                 netdev_stats_to_stats64(storage, &dev->stats);
6730         }
6731         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6732         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6733         return storage;
6734 }
6735 EXPORT_SYMBOL(dev_get_stats);
6736
/*
 * Return the ingress queue of @dev, creating it on first use.
 *
 * Creation only happens with CONFIG_NET_CLS_ACT: a new queue is
 * allocated, initialised, attached to noop_qdisc and then published in
 * dev->ingress_queue.  Without that option, or if the allocation fails,
 * whatever dev_ingress_queue() reported (possibly NULL) is returned.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        if (!queue) {
                queue = kzalloc(sizeof(*queue), GFP_KERNEL);
                if (queue) {
                        netdev_init_one_queue(dev, queue, NULL);
                        RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
                        queue->qdisc_sleeping = &noop_qdisc;
                        /* Publish only once fully set up. */
                        rcu_assign_pointer(dev->ingress_queue, queue);
                }
        }
#endif
        return queue;
}
6754
6755 static const struct ethtool_ops default_ethtool_ops;
6756
6757 void netdev_set_default_ethtool_ops(struct net_device *dev,
6758                                     const struct ethtool_ops *ops)
6759 {
6760         if (dev->ethtool_ops == &default_ethtool_ops)
6761                 dev->ethtool_ops = ops;
6762 }
6763 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6764
6765 void netdev_freemem(struct net_device *dev)
6766 {
6767         char *addr = (char *)dev - dev->padded;
6768
6769         kvfree(addr);
6770 }
6771
6772 /**
6773  *      alloc_netdev_mqs - allocate network device
6774  *      @sizeof_priv:           size of private data to allocate space for
6775  *      @name:                  device name format string
6776  *      @name_assign_type:      origin of device name
6777  *      @setup:                 callback to initialize device
6778  *      @txqs:                  the number of TX subqueues to allocate
6779  *      @rxqs:                  the number of RX subqueues to allocate
6780  *
6781  *      Allocates a struct net_device with private data area for driver use
6782  *      and performs basic initialization.  Also allocates subqueue structs
6783  *      for each queue on the device.
6784  */
6785 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6786                 unsigned char name_assign_type,
6787                 void (*setup)(struct net_device *),
6788                 unsigned int txqs, unsigned int rxqs)
6789 {
6790         struct net_device *dev;
6791         size_t alloc_size;
6792         struct net_device *p;
6793
6794         BUG_ON(strlen(name) >= sizeof(dev->name));
6795
6796         if (txqs < 1) {
6797                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6798                 return NULL;
6799         }
6800
6801 #ifdef CONFIG_SYSFS
6802         if (rxqs < 1) {
6803                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6804                 return NULL;
6805         }
6806 #endif
6807
6808         alloc_size = sizeof(struct net_device);
6809         if (sizeof_priv) {
6810                 /* ensure 32-byte alignment of private area */
6811                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6812                 alloc_size += sizeof_priv;
6813         }
6814         /* ensure 32-byte alignment of whole construct */
6815         alloc_size += NETDEV_ALIGN - 1;
6816
6817         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6818         if (!p)
6819                 p = vzalloc(alloc_size);
6820         if (!p)
6821                 return NULL;
6822
6823         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6824         dev->padded = (char *)dev - (char *)p;
6825
6826         dev->pcpu_refcnt = alloc_percpu(int);
6827         if (!dev->pcpu_refcnt)
6828                 goto free_dev;
6829
6830         if (dev_addr_init(dev))
6831                 goto free_pcpu;
6832
6833         dev_mc_init(dev);
6834         dev_uc_init(dev);
6835
6836         dev_net_set(dev, &init_net);
6837
6838         dev->gso_max_size = GSO_MAX_SIZE;
6839         dev->gso_max_segs = GSO_MAX_SEGS;
6840         dev->gso_min_segs = 0;
6841
6842         INIT_LIST_HEAD(&dev->napi_list);
6843         INIT_LIST_HEAD(&dev->unreg_list);
6844         INIT_LIST_HEAD(&dev->close_list);
6845         INIT_LIST_HEAD(&dev->link_watch_list);
6846         INIT_LIST_HEAD(&dev->adj_list.upper);
6847         INIT_LIST_HEAD(&dev->adj_list.lower);
6848         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6849         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6850         INIT_LIST_HEAD(&dev->ptype_all);
6851         INIT_LIST_HEAD(&dev->ptype_specific);
6852         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6853         setup(dev);
6854
6855         dev->num_tx_queues = txqs;
6856         dev->real_num_tx_queues = txqs;
6857         if (netif_alloc_netdev_queues(dev))
6858                 goto free_all;
6859
6860 #ifdef CONFIG_SYSFS
6861         dev->num_rx_queues = rxqs;
6862         dev->real_num_rx_queues = rxqs;
6863         if (netif_alloc_rx_queues(dev))
6864                 goto free_all;
6865 #endif
6866
6867         strcpy(dev->name, name);
6868         dev->name_assign_type = name_assign_type;
6869         dev->group = INIT_NETDEV_GROUP;
6870         if (!dev->ethtool_ops)
6871                 dev->ethtool_ops = &default_ethtool_ops;
6872         return dev;
6873
6874 free_all:
6875         free_netdev(dev);
6876         return NULL;
6877
6878 free_pcpu:
6879         free_percpu(dev->pcpu_refcnt);
6880 free_dev:
6881         netdev_freemem(dev);
6882         return NULL;
6883 }
6884 EXPORT_SYMBOL(alloc_netdev_mqs);
6885
/**
 *      free_netdev - free network device
 *      @dev: device
 *
 *      This function does the last stage of destroying an allocated device
 *      interface. The reference to the device object is released.
 *      If this is the last reference then it will be freed.
 *
 *      The queue arrays, ingress queue, device addresses, NAPI contexts and
 *      per-cpu refcount are released first.  A device that is still in
 *      NETREG_UNINITIALIZED state is then freed immediately through
 *      netdev_freemem(); any other device must be in NETREG_UNREGISTERED
 *      state (enforced with BUG_ON()) and is handed to put_device().
 */
void free_netdev(struct net_device *dev)
{
        struct napi_struct *p, *n;

        netif_free_tx_queues(dev);
#ifdef CONFIG_SYSFS
        /* dev->_rx is only touched when CONFIG_SYSFS is enabled */
        kvfree(dev->_rx);
#endif

        /* The protection condition is hard-coded to 1 (always true) */
        kfree(rcu_dereference_protected(dev->ingress_queue, 1));

        /* Flush device addresses */
        dev_addr_flush(dev);

        /* _safe iteration: presumably netif_napi_del() unlinks @p from
         * dev->napi_list while we walk it -- confirm against its definition.
         */
        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
                netif_napi_del(p);

        /* Clear the pointer after freeing so it does not dangle */
        free_percpu(dev->pcpu_refcnt);
        dev->pcpu_refcnt = NULL;

        /* Compatibility with error handling in drivers: a device that was
         * never registered is freed right here.
         */
        if (dev->reg_state == NETREG_UNINITIALIZED) {
                netdev_freemem(dev);
                return;
        }

        /* Any other state than UNREGISTERED at this point is a caller bug */
        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
        dev->reg_state = NETREG_RELEASED;

        /* will free via device release */
        put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);
6927
/**
 *      synchronize_net - wait for in-flight packet receive processing
 *
 *      Blocks until packets that are currently being received have been
 *      processed.  Packets that start being received afterwards are not
 *      waited for.  May sleep.
 *
 *      The expedited RCU grace period is used when the rtnl lock is held,
 *      the normal one otherwise.
 */
void synchronize_net(void)
{
        might_sleep();

        if (!rtnl_is_locked()) {
                synchronize_rcu();
                return;
        }

        synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6943
6944 /**
6945  *      unregister_netdevice_queue - remove device from the kernel
6946  *      @dev: device
6947  *      @head: list
6948  *
6949  *      This function shuts down a device interface and removes it
6950  *      from the kernel tables.
6951  *      If head not NULL, device is queued to be unregistered later.
6952  *
6953  *      Callers must hold the rtnl semaphore.  You may want
6954  *      unregister_netdev() instead of this.
6955  */
6956
6957 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6958 {
6959         ASSERT_RTNL();
6960
6961         if (head) {
6962                 list_move_tail(&dev->unreg_list, head);
6963         } else {
6964                 rollback_registered(dev);
6965                 /* Finish processing unregister after unlock */
6966                 net_set_todo(dev);
6967         }
6968 }
6969 EXPORT_SYMBOL(unregister_netdevice_queue);
6970
6971 /**
6972  *      unregister_netdevice_many - unregister many devices
6973  *      @head: list of devices
6974  *
6975  *  Note: As most callers use a stack allocated list_head,
6976  *  we force a list_del() to make sure stack wont be corrupted later.
6977  */
6978 void unregister_netdevice_many(struct list_head *head)
6979 {
6980         struct net_device *dev;
6981
6982         if (!list_empty(head)) {
6983                 rollback_registered_many(head);
6984                 list_for_each_entry(dev, head, unreg_list)
6985                         net_set_todo(dev);
6986                 list_del(head);
6987         }
6988 }
6989 EXPORT_SYMBOL(unregister_netdevice_many);
6990
/**
 *      unregister_netdev - remove device from the kernel
 *      @dev: device
 *
 *      Shuts down a device interface and removes it from the kernel
 *      tables.
 *
 *      This is a convenience wrapper: it takes the rtnl semaphore,
 *      calls unregister_netdevice() and drops the semaphore again.
 *      In general you want to use this and not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
        rtnl_lock();

        unregister_netdevice(dev);

        rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
7009
/**
 *      dev_change_net_namespace - move device to a different network namespace
 *      @dev: device
 *      @net: network namespace
 *      @pat: If not NULL name pattern to try if the current device name
 *            is already taken in the destination network namespace.
 *
 *      This function shuts down a device interface and moves it
 *      to a new network namespace. On success 0 is returned, on
 *      a failure a negative errno code is returned:
 *      -EINVAL if the device is namespace-local or not registered,
 *      -EEXIST if its name is taken in @net and no usable name could be
 *      derived from @pat.  Moving a device to the namespace it is
 *      already in is a successful no-op.
 *
 *      Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
        int err;

        ASSERT_RTNL();

        /* Don't allow namespace local devices to be moved. */
        err = -EINVAL;
        if (dev->features & NETIF_F_NETNS_LOCAL)
                goto out;

        /* Ensure the device has been registered (still -EINVAL otherwise) */
        if (dev->reg_state != NETREG_REGISTERED)
                goto out;

        /* Get out if there is nothing to do */
        err = 0;
        if (net_eq(dev_net(dev), net))
                goto out;

        /* Pick the destination device name, and ensure
         * we can use it in the destination network namespace.
         */
        err = -EEXIST;
        if (__dev_get_by_name(net, dev->name)) {
                /* We get here if we can't use the current device name */
                if (!pat)
                        goto out;
                if (dev_get_valid_name(net, dev, pat) < 0)
                        goto out;
        }

        /*
         * And now a mini version of register_netdevice unregister_netdevice.
         * From here on there is no error exit: the function runs to the end.
         */

        /* If device is running close it first. */
        dev_close(dev);

        /* And unlink it from device chain */
        /* NOTE(review): this -ENODEV is never returned on any path visible
         * here; err is overwritten by device_rename() further down.
         */
        err = -ENODEV;
        unlist_netdevice(dev);

        synchronize_net();

        /* Shutdown queueing discipline. */
        dev_shutdown(dev);

        /* Notify protocols, that we are about to destroy
           this device. They should clean all the things.

           Note that dev->reg_state stays at NETREG_REGISTERED.
           This is wanted because this way 8021q and macvlan know
           the device is just moving and can keep their slaves up.
        */
        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
        rcu_barrier();
        call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
        rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);

        /*
         *      Flush the unicast and multicast chains
         */
        dev_uc_flush(dev);
        dev_mc_flush(dev);

        /* Send a netdev-removed uevent to the old namespace */
        kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
        netdev_adjacent_del_links(dev);

        /* Actually switch the network namespace */
        dev_net_set(dev, net);

        /* If there is an ifindex conflict assign a new one */
        if (__dev_get_by_index(net, dev->ifindex))
                dev->ifindex = dev_new_index(net);

        /* Send a netdev-add uevent to the new namespace */
        kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
        netdev_adjacent_add_links(dev);

        /* Fixup kobjects */
        /* A rename failure is only reported through WARN_ON(); the error
         * code is discarded because err is reset to 0 before returning.
         */
        err = device_rename(&dev->dev, dev->name);
        WARN_ON(err);

        /* Add the device back in the hashes */
        list_netdevice(dev);

        /* Notify protocols, that a new device appeared. */
        call_netdevice_notifiers(NETDEV_REGISTER, dev);

        /*
         *      Prevent userspace races by waiting until the network
         *      device is fully setup before sending notifications.
         */
        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

        synchronize_net();
        err = 0;
out:
        return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7127
/*
 * CPU notifier callback: take over the networking work left behind on a
 * CPU that has gone offline.
 *
 * @nfb:    notifier block (unused here)
 * @action: notifier event; only CPU_DEAD and CPU_DEAD_FROZEN are acted on
 * @ocpu:   number of the affected CPU, passed as a pointer-sized integer
 *
 * The completion queue, output queue and NAPI poll list of the offline
 * CPU's softnet_data are moved to the softnet_data of the CPU running
 * this callback, and the packets still sitting in the offline CPU's
 * process_queue and input_pkt_queue are re-injected with netif_rx_ni().
 *
 * Always returns NOTIFY_OK.
 */
static int dev_cpu_callback(struct notifier_block *nfb,
                            unsigned long action,
                            void *ocpu)
{
        struct sk_buff **list_skb;
        struct sk_buff *skb;
        unsigned int cpu, oldcpu = (unsigned long)ocpu;
        struct softnet_data *sd, *oldsd;

        /* Every other notifier event is ignored */
        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
                return NOTIFY_OK;

        /* The list splicing below runs with local interrupts disabled */
        local_irq_disable();
        cpu = smp_processor_id();
        sd = &per_cpu(softnet_data, cpu);
        oldsd = &per_cpu(softnet_data, oldcpu);

        /* Find end of our completion_queue. */
        list_skb = &sd->completion_queue;
        while (*list_skb)
                list_skb = &(*list_skb)->next;
        /* Append completion queue from offline CPU. */
        *list_skb = oldsd->completion_queue;
        oldsd->completion_queue = NULL;

        /* Append output queue from offline CPU. */
        if (oldsd->output_queue) {
                *sd->output_queue_tailp = oldsd->output_queue;
                sd->output_queue_tailp = oldsd->output_queue_tailp;
                oldsd->output_queue = NULL;
                /* Reset the old tail pointer to the empty-queue state */
                oldsd->output_queue_tailp = &oldsd->output_queue;
        }
        /* Append NAPI poll list from offline CPU, with one exception :
         * process_backlog() must be called by cpu owning percpu backlog.
         * We properly handle process_queue & input_pkt_queue later.
         */
        while (!list_empty(&oldsd->poll_list)) {
                struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
                                                            struct napi_struct,
                                                            poll_list);

                list_del_init(&napi->poll_list);
                /* The backlog NAPI is not migrated: its state is cleared
                 * instead of scheduling it on this CPU.
                 */
                if (napi->poll == process_backlog)
                        napi->state = 0;
                else
                        ____napi_schedule(sd, napi);
        }

        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_enable();

        /* Drain the offline CPU's process_queue, then its input_pkt_queue,
         * feeding every packet back in through netif_rx_ni().
         */
        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
                netif_rx_ni(skb);
                input_queue_head_incr(oldsd);
        }
        while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
                netif_rx_ni(skb);
                input_queue_head_incr(oldsd);
        }

        return NOTIFY_OK;
}
7191
7192
7193 /**
7194  *      netdev_increment_features - increment feature set by one
7195  *      @all: current feature set
7196  *      @one: new feature set
7197  *      @mask: mask feature set
7198  *
7199  *      Computes a new feature set after adding a device with feature set
7200  *      @one to the master device with current feature set @all.  Will not
7201  *      enable anything that is off in @mask. Returns the new feature set.
7202  */
7203 netdev_features_t netdev_increment_features(netdev_features_t all,
7204         netdev_features_t one, netdev_features_t mask)
7205 {
7206         if (mask & NETIF_F_GEN_CSUM)
7207                 mask |= NETIF_F_ALL_CSUM;
7208         mask |= NETIF_F_VLAN_CHALLENGED;
7209
7210         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7211         all &= one | ~NETIF_F_ALL_FOR_ALL;
7212
7213         /* If one device supports hw checksumming, set for all. */
7214         if (all & NETIF_F_GEN_CSUM)
7215                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7216
7217         return all;
7218 }
7219 EXPORT_SYMBOL(netdev_increment_features);
7220
7221 static struct hlist_head * __net_init netdev_create_hash(void)
7222 {
7223         int i;
7224         struct hlist_head *hash;
7225
7226         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7227         if (hash != NULL)
7228                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7229                         INIT_HLIST_HEAD(&hash[i]);
7230
7231         return hash;
7232 }
7233
7234 /* Initialize per network namespace state */
7235 static int __net_init netdev_init(struct net *net)
7236 {
7237         if (net != &init_net)
7238                 INIT_LIST_HEAD(&net->dev_base_head);
7239
7240         net->dev_name_head = netdev_create_hash();
7241         if (net->dev_name_head == NULL)
7242                 goto err_name;
7243
7244         net->dev_index_head = netdev_create_hash();
7245         if (net->dev_index_head == NULL)
7246                 goto err_idx;
7247
7248         return 0;
7249
7250 err_idx:
7251         kfree(net->dev_name_head);
7252 err_name:
7253         return -ENOMEM;
7254 }
7255
7256 /**
7257  *      netdev_drivername - network driver for the device
7258  *      @dev: network device
7259  *
7260  *      Determine network driver for device.
7261  */
7262 const char *netdev_drivername(const struct net_device *dev)
7263 {
7264         const struct device_driver *driver;
7265         const struct device *parent;
7266         const char *empty = "";
7267
7268         parent = dev->dev.parent;
7269         if (!parent)
7270                 return empty;
7271
7272         driver = parent->driver;
7273         if (driver && driver->name)
7274                 return driver->name;
7275         return empty;
7276 }
7277
7278 static void __netdev_printk(const char *level, const struct net_device *dev,
7279                             struct va_format *vaf)
7280 {
7281         if (dev && dev->dev.parent) {
7282                 dev_printk_emit(level[1] - '0',
7283                                 dev->dev.parent,
7284                                 "%s %s %s%s: %pV",
7285                                 dev_driver_string(dev->dev.parent),
7286                                 dev_name(dev->dev.parent),
7287                                 netdev_name(dev), netdev_reg_state(dev),
7288                                 vaf);
7289         } else if (dev) {
7290                 printk("%s%s%s: %pV",
7291                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7292         } else {
7293                 printk("%s(NULL net_device): %pV", level, vaf);
7294         }
7295 }
7296
7297 void netdev_printk(const char *level, const struct net_device *dev,
7298                    const char *format, ...)
7299 {
7300         struct va_format vaf;
7301         va_list args;
7302
7303         va_start(args, format);
7304
7305         vaf.fmt = format;
7306         vaf.va = &args;
7307
7308         __netdev_printk(level, dev, &vaf);
7309
7310         va_end(args);
7311 }
7312 EXPORT_SYMBOL(netdev_printk);
7313
/*
 * define_netdev_printk_level - generate one fixed-level logging function
 * @func:  name of the function to define and export
 * @level: printk level string the function logs at
 *
 * Each generated function takes a net_device and a printf-style format
 * with arguments, wraps them in a struct va_format and forwards them to
 * __netdev_printk() with the given @level.  The macro also emits the
 * EXPORT_SYMBOL() for the function.
 */
#define define_netdev_printk_level(func, level)                 \
void func(const struct net_device *dev, const char *fmt, ...)   \
{                                                               \
        struct va_format vaf;                                   \
        va_list args;                                           \
                                                                \
        va_start(args, fmt);                                    \
                                                                \
        vaf.fmt = fmt;                                          \
        vaf.va = &args;                                         \
                                                                \
        __netdev_printk(level, dev, &vaf);                      \
                                                                \
        va_end(args);                                           \
}                                                               \
EXPORT_SYMBOL(func);

/* One exported helper per printk level, KERN_EMERG through KERN_INFO */
define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
7338
7339 static void __net_exit netdev_exit(struct net *net)
7340 {
7341         kfree(net->dev_name_head);
7342         kfree(net->dev_index_head);
7343 }
7344
/*
 * Per network namespace hooks that set up and tear down the device
 * name/index hash tables (netdev_init() / netdev_exit()).
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
        .init = netdev_init,
        .exit = netdev_exit,
};
7349
7350 static void __net_exit default_device_exit(struct net *net)
7351 {
7352         struct net_device *dev, *aux;
7353         /*
7354          * Push all migratable network devices back to the
7355          * initial network namespace
7356          */
7357         rtnl_lock();
7358         for_each_netdev_safe(net, dev, aux) {
7359                 int err;
7360                 char fb_name[IFNAMSIZ];
7361
7362                 /* Ignore unmoveable devices (i.e. loopback) */
7363                 if (dev->features & NETIF_F_NETNS_LOCAL)
7364                         continue;
7365
7366                 /* Leave virtual devices for the generic cleanup */
7367                 if (dev->rtnl_link_ops)
7368                         continue;
7369
7370                 /* Push remaining network devices to init_net */
7371                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7372                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7373                 if (err) {
7374                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7375                                  __func__, dev->name, err);
7376                         BUG();
7377                 }
7378         }
7379         rtnl_unlock();
7380 }
7381
/*
 * Acquire the rtnl lock at a moment when no namespace on @net_list has
 * devices in the middle of unregistering.
 *
 * @net_list: namespaces being torn down, linked through exit_list
 *
 * Returns with the rtnl lock held.
 */
static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
        /* Return with the rtnl_lock held when there are no network
         * devices unregistering in any network namespace in net_list.
         */
        struct net *net;
        bool unregistering;
        DEFINE_WAIT_FUNC(wait, woken_wake_function);

        /* Register on the wait queue before checking the counters */
        add_wait_queue(&netdev_unregistering_wq, &wait);
        for (;;) {
                unregistering = false;
                rtnl_lock();
                /* dev_unreg_count is read with the rtnl lock held */
                list_for_each_entry(net, net_list, exit_list) {
                        if (net->dev_unreg_count > 0) {
                                unregistering = true;
                                break;
                        }
                }
                /* Nothing pending: leave the loop still holding the lock */
                if (!unregistering)
                        break;
                /* NOTE(review): __rtnl_unlock() rather than rtnl_unlock() is
                 * used here, presumably to drop the lock without running the
                 * netdev todo list -- confirm against rtnetlink.c.
                 */
                __rtnl_unlock();

                /* Sleep until woken through netdev_unregistering_wq */
                wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
        }
        remove_wait_queue(&netdev_unregistering_wq, &wait);
}
7409
7410 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7411 {
7412         /* At exit all network devices most be removed from a network
7413          * namespace.  Do this in the reverse order of registration.
7414          * Do this across as many network namespaces as possible to
7415          * improve batching efficiency.
7416          */
7417         struct net_device *dev;
7418         struct net *net;
7419         LIST_HEAD(dev_kill_list);
7420
7421         /* To prevent network device cleanup code from dereferencing
7422          * loopback devices or network devices that have been freed
7423          * wait here for all pending unregistrations to complete,
7424          * before unregistring the loopback device and allowing the
7425          * network namespace be freed.
7426          *
7427          * The netdev todo list containing all network devices
7428          * unregistrations that happen in default_device_exit_batch
7429          * will run in the rtnl_unlock() at the end of
7430          * default_device_exit_batch.
7431          */
7432         rtnl_lock_unregistering(net_list);
7433         list_for_each_entry(net, net_list, exit_list) {
7434                 for_each_netdev_reverse(net, dev) {
7435                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7436                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7437                         else
7438                                 unregister_netdevice_queue(dev, &dev_kill_list);
7439                 }
7440         }
7441         unregister_netdevice_many(&dev_kill_list);
7442         rtnl_unlock();
7443 }
7444
/*
 * Per network namespace exit hooks for network devices:
 * default_device_exit() moves migratable devices back to init_net and
 * default_device_exit_batch() unregisters whatever is left.
 */
static struct pernet_operations __net_initdata default_device_ops = {
        .exit = default_device_exit,
        .exit_batch = default_device_exit_batch,
};
7449
7450 /*
7451  *      Initialize the DEV module. At boot time this walks the device list and
7452  *      unhooks any devices that fail to initialise (normally hardware not
7453  *      present) and leaves us with a valid list of present and active devices.
7454  *
7455  */
7456
/*
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 *
 *       Returns 0 on success.  Every failing step returns -ENOMEM (the
 *       initial value of rc) through the "out" label, and steps that
 *       already succeeded are not undone.
 */
static int __init net_dev_init(void)
{
        int i, rc = -ENOMEM;

        /* Must run while the boot phase flag is still set */
        BUG_ON(!dev_boot_phase);

        if (dev_proc_init())
                goto out;

        if (netdev_kobject_init())
                goto out;

        /* Start with empty packet type and offload handler lists */
        INIT_LIST_HEAD(&ptype_all);
        for (i = 0; i < PTYPE_HASH_SIZE; i++)
                INIT_LIST_HEAD(&ptype_base[i]);

        INIT_LIST_HEAD(&offload_base);

        if (register_pernet_subsys(&netdev_net_ops))
                goto out;

        /*
         *      Initialise the packet receive queues.
         */

        for_each_possible_cpu(i) {
                struct softnet_data *sd = &per_cpu(softnet_data, i);

                skb_queue_head_init(&sd->input_pkt_queue);
                skb_queue_head_init(&sd->process_queue);
                INIT_LIST_HEAD(&sd->poll_list);
                /* Empty output queue: the tail pointer refers to the head */
                sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
                sd->csd.func = rps_trigger_softirq;
                sd->csd.info = sd;
                sd->cpu = i;
#endif

                /* The per-cpu backlog is polled by process_backlog() */
                sd->backlog.poll = process_backlog;
                sd->backlog.weight = weight_p;
        }

        dev_boot_phase = 0;

        /* The loopback device is special: if any other network device
         * is present in a network namespace the loopback device must
         * be present. Since we now dynamically allocate and free the
         * loopback device ensure this invariant is maintained by
         * keeping the loopback device as the first device on the
         * list of network devices.  Ensuring the loopback device
         * is the first device that appears and the last network device
         * that disappears.
         */
        if (register_pernet_device(&loopback_net_ops))
                goto out;

        if (register_pernet_device(&default_device_ops))
                goto out;

        /* Install the handlers for the TX and RX softirqs */
        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
        open_softirq(NET_RX_SOFTIRQ, net_rx_action);

        /* dev_cpu_callback() takes over the queues of CPUs going offline */
        hotcpu_notifier(dev_cpu_callback, 0);
        dst_init();
        rc = 0;
out:
        return rc;
}

subsys_initcall(net_dev_init);