net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/proc_fs.h>
 101 #include <linux/seq_file.h>
 102 #include <linux/stat.h>
 103 #include <net/dst.h>
 104 #include <net/pkt_sched.h>
 105 #include <net/checksum.h>
 106 #include <net/xfrm.h>
 107 #include <linux/highmem.h>
 108 #include <linux/init.h>
 109 #include <linux/kmod.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/wext.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118 #include <linux/dmaengine.h>
 119 #include <linux/err.h>
 120 #include <linux/ctype.h>
 121 #include <linux/if_arp.h>
 122 #include <linux/if_vlan.h>
 123 #include <linux/ip.h>
 124 #include <net/ip.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129 #include <trace/events/napi.h>
 130 #include <trace/events/net.h>
 131 #include <trace/events/skb.h>
 132 #include <linux/pci.h>
 133 #include <linux/inetdevice.h>
 134 #include <linux/cpu_rmap.h>
 135 #include <linux/net_tstamp.h>
 136 #include <linux/static_key.h>
 137 #include <net/flow_keys.h>
 138
 139 #include "net-sysfs.h"
 140
 141 /* Instead of increasing this, you should create a hash table. */
 142 #define MAX_GRO_SKBS 8
 143
 144 /* This should be increased if a protocol with a bigger head is added. */
 145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147 /*
 148  *      The list of packet types we will receive (as opposed to discard)
 149  *      and the routines to invoke.
 150  *
  151  *      Why 16? Because with 16 the only overlap we get on a hash of the
 152  *      low nibble of the protocol value is RARP/SNAP/X.25.
 153  *
 154  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 155  *             sure which should go first, but I bet it won't make much
 156  *             difference if we are running VLANs.  The good news is that
 157  *             this protocol won't be in the list unless compiled in, so
 158  *             the average user (w/out VLANs) will not be adversely affected.
 159  *             --BLG
 160  *
 161  *              0800    IP
 162  *              8100    802.1Q VLAN
 163  *              0001    802.3
 164  *              0002    AX.25
 165  *              0004    802.2
 166  *              8035    RARP
 167  *              0005    SNAP
 168  *              0805    X.25
 169  *              0806    ARP
 170  *              8137    IPX
 171  *              0009    Localtalk
 172  *              86DD    IPv6
 173  */
 174
 175 #define PTYPE_HASH_SIZE (16)
 176 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 177
 178 static DEFINE_SPINLOCK(ptype_lock);
 179 static DEFINE_SPINLOCK(offload_lock);
 180 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 181 static struct list_head ptype_all __read_mostly;        /* Taps */
 182 static struct list_head offload_base __read_mostly;
 183
 184 /*
 185  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 186  * semaphore.
 187  *
 188  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 189  *
 190  * Writers must hold the rtnl semaphore while they loop through the
 191  * dev_base_head list, and hold dev_base_lock for writing when they do the
 192  * actual updates.  This allows pure readers to access the list even
 193  * while a writer is preparing to update it.
 194  *
 195  * To put it another way, dev_base_lock is held for writing only to
 196  * protect against pure readers; the rtnl semaphore provides the
 197  * protection against other writers.
 198  *
  199  * For example usages, see register_netdevice() and
  200  * unregister_netdevice(), which must be called with the rtnl
  201  * semaphore held.
 202  */
 203 DEFINE_RWLOCK(dev_base_lock);
 204 EXPORT_SYMBOL(dev_base_lock);
 205
 206 static inline void dev_base_seq_inc(struct net *net)
 207 {
 208         while (++net->dev_base_seq == 0);
 209 }
 210
 211 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 212 {
 213         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 214
 215         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 216 }
 217
 218 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 219 {
 220         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 221 }
 222
 223 static inline void rps_lock(struct softnet_data *sd)
 224 {
 225 #ifdef CONFIG_RPS
 226         spin_lock(&sd->input_pkt_queue.lock);
 227 #endif
 228 }
 229
 230 static inline void rps_unlock(struct softnet_data *sd)
 231 {
 232 #ifdef CONFIG_RPS
 233         spin_unlock(&sd->input_pkt_queue.lock);
 234 #endif
 235 }
 236
 237 /* Device list insertion */
 238 static int list_netdevice(struct net_device *dev)
 239 {
 240         struct net *net = dev_net(dev);
 241
 242         ASSERT_RTNL();
 243
 244         write_lock_bh(&dev_base_lock);
 245         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 246         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 247         hlist_add_head_rcu(&dev->index_hlist,
 248                            dev_index_hash(net, dev->ifindex));
 249         write_unlock_bh(&dev_base_lock);
 250
 251         dev_base_seq_inc(net);
 252
 253         return 0;
 254 }
 255
 256 /* Device list removal
 257  * caller must respect a RCU grace period before freeing/reusing dev
 258  */
 259 static void unlist_netdevice(struct net_device *dev)
 260 {
 261         ASSERT_RTNL();
 262
 263         /* Unlink dev from the device chain */
 264         write_lock_bh(&dev_base_lock);
 265         list_del_rcu(&dev->dev_list);
 266         hlist_del_rcu(&dev->name_hlist);
 267         hlist_del_rcu(&dev->index_hlist);
 268         write_unlock_bh(&dev_base_lock);
 269
 270         dev_base_seq_inc(dev_net(dev));
 271 }
 272
 273 /*
 274  *      Our notifier list
 275  */
 276
 277 static RAW_NOTIFIER_HEAD(netdev_chain);
 278
 279 /*
 280  *      Device drivers call our routines to queue packets here. We empty the
 281  *      queue in the local softnet handler.
 282  */
 283
 284 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 285 EXPORT_PER_CPU_SYMBOL(softnet_data);
 286
 287 #ifdef CONFIG_LOCKDEP
 288 /*
 289  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 290  * according to dev->type
 291  */
 292 static const unsigned short netdev_lock_type[] =
 293         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 294          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 295          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 296          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 297          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 298          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 299          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 300          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 301          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 302          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 303          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 304          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 305          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 306          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 307          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 308
 309 static const char *const netdev_lock_name[] =
 310         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 311          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 312          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 313          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 314          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 315          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 316          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 317          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 318          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 319          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 320          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 321          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 322          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 323          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 324          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 325
 326 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 327 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 328
 329 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 330 {
 331         int i;
 332
 333         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 334                 if (netdev_lock_type[i] == dev_type)
 335                         return i;
 336         /* the last key is used by default */
 337         return ARRAY_SIZE(netdev_lock_type) - 1;
 338 }
 339
 340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341                                                  unsigned short dev_type)
 342 {
 343         int i;
 344
 345         i = netdev_lock_pos(dev_type);
 346         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 347                                    netdev_lock_name[i]);
 348 }
 349
 350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 351 {
 352         int i;
 353
 354         i = netdev_lock_pos(dev->type);
 355         lockdep_set_class_and_name(&dev->addr_list_lock,
 356                                    &netdev_addr_lock_key[i],
 357                                    netdev_lock_name[i]);
 358 }
 359 #else
 360 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 361                                                  unsigned short dev_type)
 362 {
 363 }
 364 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 365 {
 366 }
 367 #endif
 368
 369 /*******************************************************************************
 370
 371                 Protocol management and registration routines
 372
 373 *******************************************************************************/
 374
 375 /*
 376  *      Add a protocol ID to the list. Now that the input handler is
 377  *      smarter we can dispense with all the messy stuff that used to be
 378  *      here.
 379  *
 380  *      BEWARE!!! Protocol handlers, mangling input packets,
 381  *      MUST BE last in hash buckets and checking protocol handlers
 382  *      MUST start from promiscuous ptype_all chain in net_bh.
 383  *      It is true now, do not change it.
 384  *      Explanation follows: if protocol handler, mangling packet, will
 385  *      be the first on list, it is not able to sense, that packet
 386  *      is cloned and should be copied-on-write, so that it will
 387  *      change it and subsequent readers will get broken packet.
 388  *                                                      --ANK (980803)
 389  */
 390
 391 static inline struct list_head *ptype_head(const struct packet_type *pt)
 392 {
 393         if (pt->type == htons(ETH_P_ALL))
 394                 return &ptype_all;
 395         else
 396                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 397 }
 398
 399 /**
 400  *      dev_add_pack - add packet handler
 401  *      @pt: packet type declaration
 402  *
 403  *      Add a protocol handler to the networking stack. The passed &packet_type
 404  *      is linked into kernel lists and may not be freed until it has been
 405  *      removed from the kernel lists.
 406  *
 407  *      This call does not sleep therefore it can not
 408  *      guarantee all CPU's that are in middle of receiving packets
 409  *      will see the new packet type (until the next received packet).
 410  */
 411
 412 void dev_add_pack(struct packet_type *pt)
 413 {
 414         struct list_head *head = ptype_head(pt);
 415
 416         spin_lock(&ptype_lock);
 417         list_add_rcu(&pt->list, head);
 418         spin_unlock(&ptype_lock);
 419 }
 420 EXPORT_SYMBOL(dev_add_pack);
 421
 422 /**
 423  *      __dev_remove_pack        - remove packet handler
 424  *      @pt: packet type declaration
 425  *
 426  *      Remove a protocol handler that was previously added to the kernel
 427  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 428  *      from the kernel lists and can be freed or reused once this function
 429  *      returns.
 430  *
 431  *      The packet type might still be in use by receivers
 432  *      and must not be freed until after all the CPU's have gone
 433  *      through a quiescent state.
 434  */
 435 void __dev_remove_pack(struct packet_type *pt)
 436 {
 437         struct list_head *head = ptype_head(pt);
 438         struct packet_type *pt1;
 439
 440         spin_lock(&ptype_lock);
 441
 442         list_for_each_entry(pt1, head, list) {
 443                 if (pt == pt1) {
 444                         list_del_rcu(&pt->list);
 445                         goto out;
 446                 }
 447         }
 448
 449         pr_warn("dev_remove_pack: %p not found\n", pt);
 450 out:
 451         spin_unlock(&ptype_lock);
 452 }
 453 EXPORT_SYMBOL(__dev_remove_pack);
 454
 455 /**
 456  *      dev_remove_pack  - remove packet handler
 457  *      @pt: packet type declaration
 458  *
 459  *      Remove a protocol handler that was previously added to the kernel
 460  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 461  *      from the kernel lists and can be freed or reused once this function
 462  *      returns.
 463  *
 464  *      This call sleeps to guarantee that no CPU is looking at the packet
 465  *      type after return.
 466  */
 467 void dev_remove_pack(struct packet_type *pt)
 468 {
 469         __dev_remove_pack(pt);
 470
 471         synchronize_net();
 472 }
 473 EXPORT_SYMBOL(dev_remove_pack);
 474
 475
 476 /**
 477  *      dev_add_offload - register offload handlers
 478  *      @po: protocol offload declaration
 479  *
 480  *      Add protocol offload handlers to the networking stack. The passed
 481  *      &proto_offload is linked into kernel lists and may not be freed until
 482  *      it has been removed from the kernel lists.
 483  *
 484  *      This call does not sleep therefore it can not
 485  *      guarantee all CPU's that are in middle of receiving packets
 486  *      will see the new offload handlers (until the next received packet).
 487  */
 488 void dev_add_offload(struct packet_offload *po)
 489 {
 490         struct list_head *head = &offload_base;
 491
 492         spin_lock(&offload_lock);
 493         list_add_rcu(&po->list, head);
 494         spin_unlock(&offload_lock);
 495 }
 496 EXPORT_SYMBOL(dev_add_offload);
 497
 498 /**
 499  *      __dev_remove_offload     - remove offload handler
 500  *      @po: packet offload declaration
 501  *
 502  *      Remove a protocol offload handler that was previously added to the
 503  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 504  *      is removed from the kernel lists and can be freed or reused once this
 505  *      function returns.
 506  *
 507  *      The packet type might still be in use by receivers
 508  *      and must not be freed until after all the CPU's have gone
 509  *      through a quiescent state.
 510  */
 511 void __dev_remove_offload(struct packet_offload *po)
 512 {
 513         struct list_head *head = &offload_base;
 514         struct packet_offload *po1;
 515
 516         spin_lock(&offload_lock);
 517
 518         list_for_each_entry(po1, head, list) {
 519                 if (po == po1) {
 520                         list_del_rcu(&po->list);
 521                         goto out;
 522                 }
 523         }
 524
 525         pr_warn("dev_remove_offload: %p not found\n", po);
 526 out:
 527         spin_unlock(&offload_lock);
 528 }
 529 EXPORT_SYMBOL(__dev_remove_offload);
 530
 531 /**
 532  *      dev_remove_offload       - remove packet offload handler
 533  *      @po: packet offload declaration
 534  *
 535  *      Remove a packet offload handler that was previously added to the kernel
 536  *      offload handlers by dev_add_offload(). The passed &offload_type is
 537  *      removed from the kernel lists and can be freed or reused once this
 538  *      function returns.
 539  *
 540  *      This call sleeps to guarantee that no CPU is looking at the packet
 541  *      type after return.
 542  */
 543 void dev_remove_offload(struct packet_offload *po)
 544 {
 545         __dev_remove_offload(po);
 546
 547         synchronize_net();
 548 }
 549 EXPORT_SYMBOL(dev_remove_offload);
 550
 551 /******************************************************************************
 552
 553                       Device Boot-time Settings Routines
 554
 555 *******************************************************************************/
 556
 557 /* Boot time configuration table */
 558 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 559
 560 /**
 561  *      netdev_boot_setup_add   - add new setup entry
 562  *      @name: name of the device
 563  *      @map: configured settings for the device
 564  *
 565  *      Adds new setup entry to the dev_boot_setup list.  The function
 566  *      returns 0 on error and 1 on success.  This is a generic routine to
 567  *      all netdevices.
 568  */
 569 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 570 {
 571         struct netdev_boot_setup *s;
 572         int i;
 573
 574         s = dev_boot_setup;
 575         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 576                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 577                         memset(s[i].name, 0, sizeof(s[i].name));
 578                         strlcpy(s[i].name, name, IFNAMSIZ);
 579                         memcpy(&s[i].map, map, sizeof(s[i].map));
 580                         break;
 581                 }
 582         }
 583
 584         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 585 }
 586
 587 /**
 588  *      netdev_boot_setup_check - check boot time settings
 589  *      @dev: the netdevice
 590  *
 591  *      Check boot time settings for the device.
 592  *      The found settings are set for the device to be used
 593  *      later in the device probing.
 594  *      Returns 0 if no settings found, 1 if they are.
 595  */
 596 int netdev_boot_setup_check(struct net_device *dev)
 597 {
 598         struct netdev_boot_setup *s = dev_boot_setup;
 599         int i;
 600
 601         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 602                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 603                     !strcmp(dev->name, s[i].name)) {
 604                         dev->irq        = s[i].map.irq;
 605                         dev->base_addr  = s[i].map.base_addr;
 606                         dev->mem_start  = s[i].map.mem_start;
 607                         dev->mem_end    = s[i].map.mem_end;
 608                         return 1;
 609                 }
 610         }
 611         return 0;
 612 }
 613 EXPORT_SYMBOL(netdev_boot_setup_check);
 614
 615
 616 /**
 617  *      netdev_boot_base        - get address from boot time settings
 618  *      @prefix: prefix for network device
 619  *      @unit: id for network device
 620  *
 621  *      Check boot time settings for the base address of device.
 622  *      The found settings are set for the device to be used
 623  *      later in the device probing.
 624  *      Returns 0 if no settings found.
 625  */
 626 unsigned long netdev_boot_base(const char *prefix, int unit)
 627 {
 628         const struct netdev_boot_setup *s = dev_boot_setup;
 629         char name[IFNAMSIZ];
 630         int i;
 631
 632         sprintf(name, "%s%d", prefix, unit);
 633
 634         /*
 635          * If device already registered then return base of 1
 636          * to indicate not to probe for this interface
 637          */
 638         if (__dev_get_by_name(&init_net, name))
 639                 return 1;
 640
 641         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 642                 if (!strcmp(name, s[i].name))
 643                         return s[i].map.base_addr;
 644         return 0;
 645 }
 646
 647 /*
 648  * Saves at boot time configured settings for any netdevice.
 649  */
 650 int __init netdev_boot_setup(char *str)
 651 {
 652         int ints[5];
 653         struct ifmap map;
 654
 655         str = get_options(str, ARRAY_SIZE(ints), ints);
 656         if (!str || !*str)
 657                 return 0;
 658
 659         /* Save settings */
 660         memset(&map, 0, sizeof(map));
 661         if (ints[0] > 0)
 662                 map.irq = ints[1];
 663         if (ints[0] > 1)
 664                 map.base_addr = ints[2];
 665         if (ints[0] > 2)
 666                 map.mem_start = ints[3];
 667         if (ints[0] > 3)
 668                 map.mem_end = ints[4];
 669
 670         /* Add new entry to the list */
 671         return netdev_boot_setup_add(str, &map);
 672 }
 673
 674 __setup("netdev=", netdev_boot_setup);
 675
 676 /*******************************************************************************
 677
 678                             Device Interface Subroutines
 679
 680 *******************************************************************************/
 681
 682 /**
 683  *      __dev_get_by_name       - find a device by its name
 684  *      @net: the applicable net namespace
 685  *      @name: name to find
 686  *
 687  *      Find an interface by name. Must be called under RTNL semaphore
 688  *      or @dev_base_lock. If the name is found a pointer to the device
 689  *      is returned. If the name is not found then %NULL is returned. The
 690  *      reference counters are not incremented so the caller must be
 691  *      careful with locks.
 692  */
 693
 694 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 695 {
 696         struct hlist_node *p;
 697         struct net_device *dev;
 698         struct hlist_head *head = dev_name_hash(net, name);
 699
 700         hlist_for_each_entry(dev, p, head, name_hlist)
 701                 if (!strncmp(dev->name, name, IFNAMSIZ))
 702                         return dev;
 703
 704         return NULL;
 705 }
 706 EXPORT_SYMBOL(__dev_get_by_name);
 707
 708 /**
 709  *      dev_get_by_name_rcu     - find a device by its name
 710  *      @net: the applicable net namespace
 711  *      @name: name to find
 712  *
 713  *      Find an interface by name.
 714  *      If the name is found a pointer to the device is returned.
 715  *      If the name is not found then %NULL is returned.
 716  *      The reference counters are not incremented so the caller must be
 717  *      careful with locks. The caller must hold RCU lock.
 718  */
 719
 720 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 721 {
 722         struct hlist_node *p;
 723         struct net_device *dev;
 724         struct hlist_head *head = dev_name_hash(net, name);
 725
 726         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 727                 if (!strncmp(dev->name, name, IFNAMSIZ))
 728                         return dev;
 729
 730         return NULL;
 731 }
 732 EXPORT_SYMBOL(dev_get_by_name_rcu);
 733
 734 /**
 735  *      dev_get_by_name         - find a device by its name
 736  *      @net: the applicable net namespace
 737  *      @name: name to find
 738  *
 739  *      Find an interface by name. This can be called from any
 740  *      context and does its own locking. The returned handle has
 741  *      the usage count incremented and the caller must use dev_put() to
 742  *      release it when it is no longer needed. %NULL is returned if no
 743  *      matching device is found.
 744  */
 745
 746 struct net_device *dev_get_by_name(struct net *net, const char *name)
 747 {
 748         struct net_device *dev;
 749
 750         rcu_read_lock();
 751         dev = dev_get_by_name_rcu(net, name);
 752         if (dev)
 753                 dev_hold(dev);
 754         rcu_read_unlock();
 755         return dev;
 756 }
 757 EXPORT_SYMBOL(dev_get_by_name);
 758
 759 /**
 760  *      __dev_get_by_index - find a device by its ifindex
 761  *      @net: the applicable net namespace
 762  *      @ifindex: index of device
 763  *
 764  *      Search for an interface by index. Returns %NULL if the device
 765  *      is not found or a pointer to the device. The device has not
 766  *      had its reference counter increased so the caller must be careful
 767  *      about locking. The caller must hold either the RTNL semaphore
 768  *      or @dev_base_lock.
 769  */
 770
 771 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 772 {
 773         struct hlist_node *p;
 774         struct net_device *dev;
 775         struct hlist_head *head = dev_index_hash(net, ifindex);
 776
 777         hlist_for_each_entry(dev, p, head, index_hlist)
 778                 if (dev->ifindex == ifindex)
 779                         return dev;
 780
 781         return NULL;
 782 }
 783 EXPORT_SYMBOL(__dev_get_by_index);
 784
 785 /**
 786  *      dev_get_by_index_rcu - find a device by its ifindex
 787  *      @net: the applicable net namespace
 788  *      @ifindex: index of device
 789  *
 790  *      Search for an interface by index. Returns %NULL if the device
 791  *      is not found or a pointer to the device. The device has not
 792  *      had its reference counter increased so the caller must be careful
 793  *      about locking. The caller must hold RCU lock.
 794  */
 795
 796 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 797 {
 798         struct hlist_node *p;
 799         struct net_device *dev;
 800         struct hlist_head *head = dev_index_hash(net, ifindex);
 801
 802         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 803                 if (dev->ifindex == ifindex)
 804                         return dev;
 805
 806         return NULL;
 807 }
 808 EXPORT_SYMBOL(dev_get_by_index_rcu);
 809
 810
 811 /**
 812  *      dev_get_by_index - find a device by its ifindex
 813  *      @net: the applicable net namespace
 814  *      @ifindex: index of device
 815  *
 816  *      Search for an interface by index. Returns NULL if the device
 817  *      is not found or a pointer to the device. The device returned has
 818  *      had a reference added and the pointer is safe until the user calls
 819  *      dev_put to indicate they have finished with it.
 820  */
 821
 822 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 823 {
 824         struct net_device *dev;
 825
 826         rcu_read_lock();
 827         dev = dev_get_by_index_rcu(net, ifindex);
 828         if (dev)
 829                 dev_hold(dev);
 830         rcu_read_unlock();
 831         return dev;
 832 }
 833 EXPORT_SYMBOL(dev_get_by_index);
 834
 835 /**
 836  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 837  *      @net: the applicable net namespace
 838  *      @type: media type of device
 839  *      @ha: hardware address
 840  *
 841  *      Search for an interface by MAC address. Returns NULL if the device
 842  *      is not found or a pointer to the device.
 843  *      The caller must hold RCU or RTNL.
 844  *      The returned device has not had its ref count increased
 845  *      and the caller must therefore be careful about locking
 846  *
 847  */
 848
 849 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 850                                        const char *ha)
 851 {
 852         struct net_device *dev;
 853
 854         for_each_netdev_rcu(net, dev)
 855                 if (dev->type == type &&
 856                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 857                         return dev;
 858
 859         return NULL;
 860 }
 861 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 862
 863 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 864 {
 865         struct net_device *dev;
 866
 867         ASSERT_RTNL();
 868         for_each_netdev(net, dev)
 869                 if (dev->type == type)
 870                         return dev;
 871
 872         return NULL;
 873 }
 874 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 875
 876 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 877 {
 878         struct net_device *dev, *ret = NULL;
 879
 880         rcu_read_lock();
 881         for_each_netdev_rcu(net, dev)
 882                 if (dev->type == type) {
 883                         dev_hold(dev);
 884                         ret = dev;
 885                         break;
 886                 }
 887         rcu_read_unlock();
 888         return ret;
 889 }
 890 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 891
 892 /**
 893  *      dev_get_by_flags_rcu - find any device with given flags
 894  *      @net: the applicable net namespace
 895  *      @if_flags: IFF_* values
 896  *      @mask: bitmask of bits in if_flags to check
 897  *
 898  *      Search for any interface with the given flags. Returns NULL if a device
 899  *      is not found or a pointer to the device. Must be called inside
 900  *      rcu_read_lock(), and result refcount is unchanged.
 901  */
 902
 903 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 904                                     unsigned short mask)
 905 {
 906         struct net_device *dev, *ret;
 907
 908         ret = NULL;
 909         for_each_netdev_rcu(net, dev) {
 910                 if (((dev->flags ^ if_flags) & mask) == 0) {
 911                         ret = dev;
 912                         break;
 913                 }
 914         }
 915         return ret;
 916 }
 917 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 918
 919 /**
 920  *      dev_valid_name - check if name is okay for network device
 921  *      @name: name string
 922  *
 923  *      Network device names need to be valid file names to
 924  *      to allow sysfs to work.  We also disallow any kind of
 925  *      whitespace.
 926  */
 927 bool dev_valid_name(const char *name)
 928 {
 929         if (*name == '\0')
 930                 return false;
 931         if (strlen(name) >= IFNAMSIZ)
 932                 return false;
 933         if (!strcmp(name, ".") || !strcmp(name, ".."))
 934                 return false;
 935
 936         while (*name) {
 937                 if (*name == '/' || isspace(*name))
 938                         return false;
 939                 name++;
 940         }
 941         return true;
 942 }
 943 EXPORT_SYMBOL(dev_valid_name);
 944
 945 /**
 946  *      __dev_alloc_name - allocate a name for a device
 947  *      @net: network namespace to allocate the device name in
 948  *      @name: name format string
 949  *      @buf:  scratch buffer and result name string
 950  *
 951  *      Passed a format string - eg "lt%d" it will try and find a suitable
 952  *      id. It scans list of devices to build up a free map, then chooses
 953  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 954  *      while allocating the name and adding the device in order to avoid
 955  *      duplicates.
 956  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 957  *      Returns the number of the unit assigned or a negative errno code.
 958  */
 959
 960 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 961 {
 962         int i = 0;
 963         const char *p;
 964         const int max_netdevices = 8*PAGE_SIZE;
 965         unsigned long *inuse;
 966         struct net_device *d;
 967
 968         p = strnchr(name, IFNAMSIZ-1, '%');
 969         if (p) {
 970                 /*
 971                  * Verify the string as this thing may have come from
 972                  * the user.  There must be either one "%d" and no other "%"
 973                  * characters.
 974                  */
 975                 if (p[1] != 'd' || strchr(p + 2, '%'))
 976                         return -EINVAL;
 977
 978                 /* Use one page as a bit array of possible slots */
 979                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 980                 if (!inuse)
 981                         return -ENOMEM;
 982
 983                 for_each_netdev(net, d) {
 984                         if (!sscanf(d->name, name, &i))
 985                                 continue;
 986                         if (i < 0 || i >= max_netdevices)
 987                                 continue;
 988
 989                         /*  avoid cases where sscanf is not exact inverse of printf */
 990                         snprintf(buf, IFNAMSIZ, name, i);
 991                         if (!strncmp(buf, d->name, IFNAMSIZ))
 992                                 set_bit(i, inuse);
 993                 }
 994
 995                 i = find_first_zero_bit(inuse, max_netdevices);
 996                 free_page((unsigned long) inuse);
 997         }
 998
 999         if (buf != name)
1000                 snprintf(buf, IFNAMSIZ, name, i);
1001         if (!__dev_get_by_name(net, buf))
1002                 return i;
1003
1004         /* It is possible to run out of possible slots
1005          * when the name is long and there isn't enough space left
1006          * for the digits, or if all bits are used.
1007          */
1008         return -ENFILE;
1009 }
1010
1011 /**
1012  *      dev_alloc_name - allocate a name for a device
1013  *      @dev: device
1014  *      @name: name format string
1015  *
1016  *      Passed a format string - eg "lt%d" it will try and find a suitable
1017  *      id. It scans list of devices to build up a free map, then chooses
1018  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1019  *      while allocating the name and adding the device in order to avoid
1020  *      duplicates.
1021  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1022  *      Returns the number of the unit assigned or a negative errno code.
1023  */
1024
1025 int dev_alloc_name(struct net_device *dev, const char *name)
1026 {
1027         char buf[IFNAMSIZ];
1028         struct net *net;
1029         int ret;
1030
1031         BUG_ON(!dev_net(dev));
1032         net = dev_net(dev);
1033         ret = __dev_alloc_name(net, name, buf);
1034         if (ret >= 0)
1035                 strlcpy(dev->name, buf, IFNAMSIZ);
1036         return ret;
1037 }
1038 EXPORT_SYMBOL(dev_alloc_name);
1039
1040 static int dev_alloc_name_ns(struct net *net,
1041                              struct net_device *dev,
1042                              const char *name)
1043 {
1044         char buf[IFNAMSIZ];
1045         int ret;
1046
1047         ret = __dev_alloc_name(net, name, buf);
1048         if (ret >= 0)
1049                 strlcpy(dev->name, buf, IFNAMSIZ);
1050         return ret;
1051 }
1052
1053 static int dev_get_valid_name(struct net *net,
1054                               struct net_device *dev,
1055                               const char *name)
1056 {
1057         BUG_ON(!net);
1058
1059         if (!dev_valid_name(name))
1060                 return -EINVAL;
1061
1062         if (strchr(name, '%'))
1063                 return dev_alloc_name_ns(net, dev, name);
1064         else if (__dev_get_by_name(net, name))
1065                 return -EEXIST;
1066         else if (dev->name != name)
1067                 strlcpy(dev->name, name, IFNAMSIZ);
1068
1069         return 0;
1070 }
1071
1072 /**
1073  *      dev_change_name - change name of a device
1074  *      @dev: device
1075  *      @newname: name (or format string) must be at least IFNAMSIZ
1076  *
1077  *      Change name of a device, can pass format strings "eth%d".
1078  *      for wildcarding.
1079  */
1080 int dev_change_name(struct net_device *dev, const char *newname)
1081 {
1082         char oldname[IFNAMSIZ];
1083         int err = 0;
1084         int ret;
1085         struct net *net;
1086
1087         ASSERT_RTNL();
1088         BUG_ON(!dev_net(dev));
1089
1090         net = dev_net(dev);
1091         if (dev->flags & IFF_UP)
1092                 return -EBUSY;
1093
1094         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1095                 return 0;
1096
1097         memcpy(oldname, dev->name, IFNAMSIZ);
1098
1099         err = dev_get_valid_name(net, dev, newname);
1100         if (err < 0)
1101                 return err;
1102
1103 rollback:
1104         ret = device_rename(&dev->dev, dev->name);
1105         if (ret) {
1106                 memcpy(dev->name, oldname, IFNAMSIZ);
1107                 return ret;
1108         }
1109
1110         write_lock_bh(&dev_base_lock);
1111         hlist_del_rcu(&dev->name_hlist);
1112         write_unlock_bh(&dev_base_lock);
1113
1114         synchronize_rcu();
1115
1116         write_lock_bh(&dev_base_lock);
1117         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1118         write_unlock_bh(&dev_base_lock);
1119
1120         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1121         ret = notifier_to_errno(ret);
1122
1123         if (ret) {
1124                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1125                 if (err >= 0) {
1126                         err = ret;
1127                         memcpy(dev->name, oldname, IFNAMSIZ);
1128                         goto rollback;
1129                 } else {
1130                         pr_err("%s: name change rollback failed: %d\n",
1131                                dev->name, ret);
1132                 }
1133         }
1134
1135         return err;
1136 }
1137
1138 /**
1139  *      dev_set_alias - change ifalias of a device
1140  *      @dev: device
1141  *      @alias: name up to IFALIASZ
1142  *      @len: limit of bytes to copy from info
1143  *
1144  *      Set ifalias for a device,
1145  */
1146 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1147 {
1148         char *new_ifalias;
1149
1150         ASSERT_RTNL();
1151
1152         if (len >= IFALIASZ)
1153                 return -EINVAL;
1154
1155         if (!len) {
1156                 if (dev->ifalias) {
1157                         kfree(dev->ifalias);
1158                         dev->ifalias = NULL;
1159                 }
1160                 return 0;
1161         }
1162
1163         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1164         if (!new_ifalias)
1165                 return -ENOMEM;
1166         dev->ifalias = new_ifalias;
1167
1168         strlcpy(dev->ifalias, alias, len+1);
1169         return len;
1170 }
1171
1172
1173 /**
1174  *      netdev_features_change - device changes features
1175  *      @dev: device to cause notification
1176  *
1177  *      Called to indicate a device has changed features.
1178  */
1179 void netdev_features_change(struct net_device *dev)
1180 {
1181         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1182 }
1183 EXPORT_SYMBOL(netdev_features_change);
1184
1185 /**
1186  *      netdev_state_change - device changes state
1187  *      @dev: device to cause notification
1188  *
1189  *      Called to indicate a device has changed state. This function calls
1190  *      the notifier chains for netdev_chain and sends a NEWLINK message
1191  *      to the routing socket.
1192  */
1193 void netdev_state_change(struct net_device *dev)
1194 {
1195         if (dev->flags & IFF_UP) {
1196                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1197                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1198         }
1199 }
1200 EXPORT_SYMBOL(netdev_state_change);
1201
1202 /**
1203  *      netdev_notify_peers - notify network peers about existence of @dev
1204  *      @dev: network device
1205  *
1206  * Generate traffic such that interested network peers are aware of
1207  * @dev, such as by generating a gratuitous ARP. This may be used when
1208  * a device wants to inform the rest of the network about some sort of
1209  * reconfiguration such as a failover event or virtual machine
1210  * migration.
1211  */
1212 void netdev_notify_peers(struct net_device *dev)
1213 {
1214         rtnl_lock();
1215         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1216         rtnl_unlock();
1217 }
1218 EXPORT_SYMBOL(netdev_notify_peers);
1219
1220 /**
1221  *      dev_load        - load a network module
1222  *      @net: the applicable net namespace
1223  *      @name: name of interface
1224  *
1225  *      If a network interface is not present and the process has suitable
1226  *      privileges this function loads the module. If module loading is not
1227  *      available in this kernel then it becomes a nop.
1228  */
1229
1230 void dev_load(struct net *net, const char *name)
1231 {
1232         struct net_device *dev;
1233         int no_module;
1234
1235         rcu_read_lock();
1236         dev = dev_get_by_name_rcu(net, name);
1237         rcu_read_unlock();
1238
1239         no_module = !dev;
1240         if (no_module && capable(CAP_NET_ADMIN))
1241                 no_module = request_module("netdev-%s", name);
1242         if (no_module && capable(CAP_SYS_MODULE)) {
1243                 if (!request_module("%s", name))
1244                         pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1245                                 name);
1246         }
1247 }
1248 EXPORT_SYMBOL(dev_load);
1249
1250 static int __dev_open(struct net_device *dev)
1251 {
1252         const struct net_device_ops *ops = dev->netdev_ops;
1253         int ret;
1254
1255         ASSERT_RTNL();
1256
1257         if (!netif_device_present(dev))
1258                 return -ENODEV;
1259
1260         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1261         ret = notifier_to_errno(ret);
1262         if (ret)
1263                 return ret;
1264
1265         set_bit(__LINK_STATE_START, &dev->state);
1266
1267         if (ops->ndo_validate_addr)
1268                 ret = ops->ndo_validate_addr(dev);
1269
1270         if (!ret && ops->ndo_open)
1271                 ret = ops->ndo_open(dev);
1272
1273         if (ret)
1274                 clear_bit(__LINK_STATE_START, &dev->state);
1275         else {
1276                 dev->flags |= IFF_UP;
1277                 net_dmaengine_get();
1278                 dev_set_rx_mode(dev);
1279                 dev_activate(dev);
1280                 add_device_randomness(dev->dev_addr, dev->addr_len);
1281         }
1282
1283         return ret;
1284 }
1285
1286 /**
1287  *      dev_open        - prepare an interface for use.
1288  *      @dev:   device to open
1289  *
1290  *      Takes a device from down to up state. The device's private open
1291  *      function is invoked and then the multicast lists are loaded. Finally
1292  *      the device is moved into the up state and a %NETDEV_UP message is
1293  *      sent to the netdev notifier chain.
1294  *
1295  *      Calling this function on an active interface is a nop. On a failure
1296  *      a negative errno code is returned.
1297  */
1298 int dev_open(struct net_device *dev)
1299 {
1300         int ret;
1301
1302         if (dev->flags & IFF_UP)
1303                 return 0;
1304
1305         ret = __dev_open(dev);
1306         if (ret < 0)
1307                 return ret;
1308
1309         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1310         call_netdevice_notifiers(NETDEV_UP, dev);
1311
1312         return ret;
1313 }
1314 EXPORT_SYMBOL(dev_open);
1315
1316 static int __dev_close_many(struct list_head *head)
1317 {
1318         struct net_device *dev;
1319
1320         ASSERT_RTNL();
1321         might_sleep();
1322
1323         list_for_each_entry(dev, head, unreg_list) {
1324                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1325
1326                 clear_bit(__LINK_STATE_START, &dev->state);
1327
1328                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1329                  * can be even on different cpu. So just clear netif_running().
1330                  *
1331                  * dev->stop() will invoke napi_disable() on all of it's
1332                  * napi_struct instances on this device.
1333                  */
1334                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1335         }
1336
1337         dev_deactivate_many(head);
1338
1339         list_for_each_entry(dev, head, unreg_list) {
1340                 const struct net_device_ops *ops = dev->netdev_ops;
1341
1342                 /*
1343                  *      Call the device specific close. This cannot fail.
1344                  *      Only if device is UP
1345                  *
1346                  *      We allow it to be called even after a DETACH hot-plug
1347                  *      event.
1348                  */
1349                 if (ops->ndo_stop)
1350                         ops->ndo_stop(dev);
1351
1352                 dev->flags &= ~IFF_UP;
1353                 net_dmaengine_put();
1354         }
1355
1356         return 0;
1357 }
1358
1359 static int __dev_close(struct net_device *dev)
1360 {
1361         int retval;
1362         LIST_HEAD(single);
1363
1364         list_add(&dev->unreg_list, &single);
1365         retval = __dev_close_many(&single);
1366         list_del(&single);
1367         return retval;
1368 }
1369
1370 static int dev_close_many(struct list_head *head)
1371 {
1372         struct net_device *dev, *tmp;
1373         LIST_HEAD(tmp_list);
1374
1375         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1376                 if (!(dev->flags & IFF_UP))
1377                         list_move(&dev->unreg_list, &tmp_list);
1378
1379         __dev_close_many(head);
1380
1381         list_for_each_entry(dev, head, unreg_list) {
1382                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1383                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1384         }
1385
1386         /* rollback_registered_many needs the complete original list */
1387         list_splice(&tmp_list, head);
1388         return 0;
1389 }
1390
1391 /**
1392  *      dev_close - shutdown an interface.
1393  *      @dev: device to shutdown
1394  *
1395  *      This function moves an active device into down state. A
1396  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1397  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1398  *      chain.
1399  */
1400 int dev_close(struct net_device *dev)
1401 {
1402         if (dev->flags & IFF_UP) {
1403                 LIST_HEAD(single);
1404
1405                 list_add(&dev->unreg_list, &single);
1406                 dev_close_many(&single);
1407                 list_del(&single);
1408         }
1409         return 0;
1410 }
1411 EXPORT_SYMBOL(dev_close);
1412
1413
1414 /**
1415  *      dev_disable_lro - disable Large Receive Offload on a device
1416  *      @dev: device
1417  *
1418  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1419  *      called under RTNL.  This is needed if received packets may be
1420  *      forwarded to another interface.
1421  */
1422 void dev_disable_lro(struct net_device *dev)
1423 {
1424         /*
1425          * If we're trying to disable lro on a vlan device
1426          * use the underlying physical device instead
1427          */
1428         if (is_vlan_dev(dev))
1429                 dev = vlan_dev_real_dev(dev);
1430
1431         dev->wanted_features &= ~NETIF_F_LRO;
1432         netdev_update_features(dev);
1433
1434         if (unlikely(dev->features & NETIF_F_LRO))
1435                 netdev_WARN(dev, "failed to disable LRO!\n");
1436 }
1437 EXPORT_SYMBOL(dev_disable_lro);
1438
1439
1440 static int dev_boot_phase = 1;
1441
1442 /**
1443  *      register_netdevice_notifier - register a network notifier block
1444  *      @nb: notifier
1445  *
1446  *      Register a notifier to be called when network device events occur.
1447  *      The notifier passed is linked into the kernel structures and must
1448  *      not be reused until it has been unregistered. A negative errno code
1449  *      is returned on a failure.
1450  *
1451  *      When registered all registration and up events are replayed
1452  *      to the new notifier to allow device to have a race free
1453  *      view of the network device list.
1454  */
1455
1456 int register_netdevice_notifier(struct notifier_block *nb)
1457 {
1458         struct net_device *dev;
1459         struct net_device *last;
1460         struct net *net;
1461         int err;
1462
1463         rtnl_lock();
1464         err = raw_notifier_chain_register(&netdev_chain, nb);
1465         if (err)
1466                 goto unlock;
1467         if (dev_boot_phase)
1468                 goto unlock;
1469         for_each_net(net) {
1470                 for_each_netdev(net, dev) {
1471                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1472                         err = notifier_to_errno(err);
1473                         if (err)
1474                                 goto rollback;
1475
1476                         if (!(dev->flags & IFF_UP))
1477                                 continue;
1478
1479                         nb->notifier_call(nb, NETDEV_UP, dev);
1480                 }
1481         }
1482
1483 unlock:
1484         rtnl_unlock();
1485         return err;
1486
1487 rollback:
1488         last = dev;
1489         for_each_net(net) {
1490                 for_each_netdev(net, dev) {
1491                         if (dev == last)
1492                                 goto outroll;
1493
1494                         if (dev->flags & IFF_UP) {
1495                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1496                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1497                         }
1498                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1499                 }
1500         }
1501
1502 outroll:
1503         raw_notifier_chain_unregister(&netdev_chain, nb);
1504         goto unlock;
1505 }
1506 EXPORT_SYMBOL(register_netdevice_notifier);
1507
1508 /**
1509  *      unregister_netdevice_notifier - unregister a network notifier block
1510  *      @nb: notifier
1511  *
1512  *      Unregister a notifier previously registered by
1513  *      register_netdevice_notifier(). The notifier is unlinked into the
1514  *      kernel structures and may then be reused. A negative errno code
1515  *      is returned on a failure.
1516  *
1517  *      After unregistering unregister and down device events are synthesized
1518  *      for all devices on the device list to the removed notifier to remove
1519  *      the need for special case cleanup code.
1520  */
1521
1522 int unregister_netdevice_notifier(struct notifier_block *nb)
1523 {
1524         struct net_device *dev;
1525         struct net *net;
1526         int err;
1527
1528         rtnl_lock();
1529         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1530         if (err)
1531                 goto unlock;
1532
1533         for_each_net(net) {
1534                 for_each_netdev(net, dev) {
1535                         if (dev->flags & IFF_UP) {
1536                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1537                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1538                         }
1539                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1540                 }
1541         }
1542 unlock:
1543         rtnl_unlock();
1544         return err;
1545 }
1546 EXPORT_SYMBOL(unregister_netdevice_notifier);
1547
1548 /**
1549  *      call_netdevice_notifiers - call all network notifier blocks
1550  *      @val: value passed unmodified to notifier function
1551  *      @dev: net_device pointer passed unmodified to notifier function
1552  *
1553  *      Call all network notifier blocks.  Parameters and return value
1554  *      are as for raw_notifier_call_chain().
1555  */
1556
1557 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1558 {
1559         ASSERT_RTNL();
1560         return raw_notifier_call_chain(&netdev_chain, val, dev);
1561 }
1562 EXPORT_SYMBOL(call_netdevice_notifiers);
1563
1564 static struct static_key netstamp_needed __read_mostly;
1565 #ifdef HAVE_JUMP_LABEL
1566 /* We are not allowed to call static_key_slow_dec() from irq context
1567  * If net_disable_timestamp() is called from irq context, defer the
1568  * static_key_slow_dec() calls.
1569  */
1570 static atomic_t netstamp_needed_deferred;
1571 #endif
1572
1573 void net_enable_timestamp(void)
1574 {
1575 #ifdef HAVE_JUMP_LABEL
1576         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1577
1578         if (deferred) {
1579                 while (--deferred)
1580                         static_key_slow_dec(&netstamp_needed);
1581                 return;
1582         }
1583 #endif
1584         WARN_ON(in_interrupt());
1585         static_key_slow_inc(&netstamp_needed);
1586 }
1587 EXPORT_SYMBOL(net_enable_timestamp);
1588
1589 void net_disable_timestamp(void)
1590 {
1591 #ifdef HAVE_JUMP_LABEL
1592         if (in_interrupt()) {
1593                 atomic_inc(&netstamp_needed_deferred);
1594                 return;
1595         }
1596 #endif
1597         static_key_slow_dec(&netstamp_needed);
1598 }
1599 EXPORT_SYMBOL(net_disable_timestamp);
1600
1601 static inline void net_timestamp_set(struct sk_buff *skb)
1602 {
1603         skb->tstamp.tv64 = 0;
1604         if (static_key_false(&netstamp_needed))
1605                 __net_timestamp(skb);
1606 }
1607
/*
 * Stamp SKB via __net_timestamp() when timestamping is enabled through the
 * netstamp_needed key, COND holds and SKB carries no timestamp yet.
 *
 * Wrapped in do { } while (0) so that the macro expands to exactly one
 * statement: it can be followed by ';' and used as the body of an unbraced
 * if/else without capturing a following else.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1613
1614 static int net_hwtstamp_validate(struct ifreq *ifr)
1615 {
1616         struct hwtstamp_config cfg;
1617         enum hwtstamp_tx_types tx_type;
1618         enum hwtstamp_rx_filters rx_filter;
1619         int tx_type_valid = 0;
1620         int rx_filter_valid = 0;
1621
1622         if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1623                 return -EFAULT;
1624
1625         if (cfg.flags) /* reserved for future extensions */
1626                 return -EINVAL;
1627
1628         tx_type = cfg.tx_type;
1629         rx_filter = cfg.rx_filter;
1630
1631         switch (tx_type) {
1632         case HWTSTAMP_TX_OFF:
1633         case HWTSTAMP_TX_ON:
1634         case HWTSTAMP_TX_ONESTEP_SYNC:
1635                 tx_type_valid = 1;
1636                 break;
1637         }
1638
1639         switch (rx_filter) {
1640         case HWTSTAMP_FILTER_NONE:
1641         case HWTSTAMP_FILTER_ALL:
1642         case HWTSTAMP_FILTER_SOME:
1643         case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1644         case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1645         case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1646         case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1647         case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1648         case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1649         case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1650         case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1651         case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1652         case HWTSTAMP_FILTER_PTP_V2_EVENT:
1653         case HWTSTAMP_FILTER_PTP_V2_SYNC:
1654         case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1655                 rx_filter_valid = 1;
1656                 break;
1657         }
1658
1659         if (!tx_type_valid || !rx_filter_valid)
1660                 return -ERANGE;
1661
1662         return 0;
1663 }
1664
1665 static inline bool is_skb_forwardable(struct net_device *dev,
1666                                       struct sk_buff *skb)
1667 {
1668         unsigned int len;
1669
1670         if (!(dev->flags & IFF_UP))
1671                 return false;
1672
1673         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1674         if (skb->len <= len)
1675                 return true;
1676
1677         /* if TSO is enabled, we don't care about the length as the packet
1678          * could be forwarded without being segmented before
1679          */
1680         if (skb_is_gso(skb))
1681                 return true;
1682
1683         return false;
1684 }
1685
1686 /**
1687  * dev_forward_skb - loopback an skb to another netif
1688  *
1689  * @dev: destination network device
1690  * @skb: buffer to forward
1691  *
1692  * return values:
1693  *      NET_RX_SUCCESS  (no congestion)
1694  *      NET_RX_DROP     (packet was dropped, but freed)
1695  *
1696  * dev_forward_skb can be used for injecting an skb from the
1697  * start_xmit function of one device into the receive queue
1698  * of another device.
1699  *
1700  * The receiving device may be in another namespace, so
1701  * we have to clear all information in the skb that could
1702  * impact namespace isolation.
1703  */
1704 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1705 {
1706         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1707                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1708                         atomic_long_inc(&dev->rx_dropped);
1709                         kfree_skb(skb);
1710                         return NET_RX_DROP;
1711                 }
1712         }
1713
1714         skb_orphan(skb);
1715         nf_reset(skb);
1716
1717         if (unlikely(!is_skb_forwardable(dev, skb))) {
1718                 atomic_long_inc(&dev->rx_dropped);
1719                 kfree_skb(skb);
1720                 return NET_RX_DROP;
1721         }
1722         skb->skb_iif = 0;
1723         skb->dev = dev;
1724         skb_dst_drop(skb);
1725         skb->tstamp.tv64 = 0;
1726         skb->pkt_type = PACKET_HOST;
1727         skb->protocol = eth_type_trans(skb, dev);
1728         skb->mark = 0;
1729         secpath_reset(skb);
1730         nf_reset(skb);
1731         return netif_rx(skb);
1732 }
1733 EXPORT_SYMBOL_GPL(dev_forward_skb);
1734
1735 static inline int deliver_skb(struct sk_buff *skb,
1736                               struct packet_type *pt_prev,
1737                               struct net_device *orig_dev)
1738 {
1739         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1740                 return -ENOMEM;
1741         atomic_inc(&skb->users);
1742         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1743 }
1744
1745 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1746 {
1747         if (!ptype->af_packet_priv || !skb->sk)
1748                 return false;
1749
1750         if (ptype->id_match)
1751                 return ptype->id_match(ptype, skb->sk);
1752         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1753                 return true;
1754
1755         return false;
1756 }
1757
1758 /*
1759  *      Support routine. Sends outgoing frames to any network
1760  *      taps currently in use.
1761  */
1762
1763 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1764 {
1765         struct packet_type *ptype;
1766         struct sk_buff *skb2 = NULL;
1767         struct packet_type *pt_prev = NULL;
1768
1769         rcu_read_lock();
1770         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1771                 /* Never send packets back to the socket
1772                  * they originated from - MvS (miquels@drinkel.ow.org)
1773                  */
1774                 if ((ptype->dev == dev || !ptype->dev) &&
1775                     (!skb_loop_sk(ptype, skb))) {
1776                         if (pt_prev) {
1777                                 deliver_skb(skb2, pt_prev, skb->dev);
1778                                 pt_prev = ptype;
1779                                 continue;
1780                         }
1781
1782                         skb2 = skb_clone(skb, GFP_ATOMIC);
1783                         if (!skb2)
1784                                 break;
1785
1786                         net_timestamp_set(skb2);
1787
1788                         /* skb->nh should be correctly
1789                            set by sender, so that the second statement is
1790                            just protection against buggy protocols.
1791                          */
1792                         skb_reset_mac_header(skb2);
1793
1794                         if (skb_network_header(skb2) < skb2->data ||
1795                             skb2->network_header > skb2->tail) {
1796                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1797                                                      ntohs(skb2->protocol),
1798                                                      dev->name);
1799                                 skb_reset_network_header(skb2);
1800                         }
1801
1802                         skb2->transport_header = skb2->network_header;
1803                         skb2->pkt_type = PACKET_OUTGOING;
1804                         pt_prev = ptype;
1805                 }
1806         }
1807         if (pt_prev)
1808                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1809         rcu_read_unlock();
1810 }
1811
1812 /**
1813  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1814  * @dev: Network device
1815  * @txq: number of queues available
1816  *
1817  * If real_num_tx_queues is changed the tc mappings may no longer be
1818  * valid. To resolve this verify the tc mapping remains valid and if
1819  * not NULL the mapping. With no priorities mapping to this
1820  * offset/count pair it will no longer be used. In the worst case TC0
1821  * is invalid nothing can be done so disable priority mappings. If is
1822  * expected that drivers will fix this mapping if they can before
1823  * calling netif_set_real_num_tx_queues.
1824  */
1825 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1826 {
1827         int i;
1828         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1829
1830         /* If TC0 is invalidated disable TC mapping */
1831         if (tc->offset + tc->count > txq) {
1832                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1833                 dev->num_tc = 0;
1834                 return;
1835         }
1836
1837         /* Invalidated prio to tc mappings set to TC0 */
1838         for (i = 1; i < TC_BITMASK + 1; i++) {
1839                 int q = netdev_get_prio_tc_map(dev, i);
1840
1841                 tc = &dev->tc_to_txq[q];
1842                 if (tc->offset + tc->count > txq) {
1843                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1844                                 i, q);
1845                         netdev_set_prio_tc_map(dev, i, 0);
1846                 }
1847         }
1848 }
1849
1850 /*
1851  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1852  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1853  */
1854 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1855 {
1856         int rc;
1857
1858         if (txq < 1 || txq > dev->num_tx_queues)
1859                 return -EINVAL;
1860
1861         if (dev->reg_state == NETREG_REGISTERED ||
1862             dev->reg_state == NETREG_UNREGISTERING) {
1863                 ASSERT_RTNL();
1864
1865                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1866                                                   txq);
1867                 if (rc)
1868                         return rc;
1869
1870                 if (dev->num_tc)
1871                         netif_setup_tc(dev, txq);
1872
1873                 if (txq < dev->real_num_tx_queues)
1874                         qdisc_reset_all_tx_gt(dev, txq);
1875         }
1876
1877         dev->real_num_tx_queues = txq;
1878         return 0;
1879 }
1880 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1881
1882 #ifdef CONFIG_RPS
1883 /**
1884  *      netif_set_real_num_rx_queues - set actual number of RX queues used
1885  *      @dev: Network device
1886  *      @rxq: Actual number of RX queues
1887  *
1888  *      This must be called either with the rtnl_lock held or before
1889  *      registration of the net device.  Returns 0 on success, or a
1890  *      negative error code.  If called before registration, it always
1891  *      succeeds.
1892  */
1893 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1894 {
1895         int rc;
1896
1897         if (rxq < 1 || rxq > dev->num_rx_queues)
1898                 return -EINVAL;
1899
1900         if (dev->reg_state == NETREG_REGISTERED) {
1901                 ASSERT_RTNL();
1902
1903                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1904                                                   rxq);
1905                 if (rc)
1906                         return rc;
1907         }
1908
1909         dev->real_num_rx_queues = rxq;
1910         return 0;
1911 }
1912 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1913 #endif
1914
1915 /**
1916  * netif_get_num_default_rss_queues - default number of RSS queues
1917  *
1918  * This routine should set an upper limit on the number of RSS queues
1919  * used by default by multiqueue devices.
1920  */
1921 int netif_get_num_default_rss_queues(void)
1922 {
1923         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
1924 }
1925 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
1926
1927 static inline void __netif_reschedule(struct Qdisc *q)
1928 {
1929         struct softnet_data *sd;
1930         unsigned long flags;
1931
1932         local_irq_save(flags);
1933         sd = &__get_cpu_var(softnet_data);
1934         q->next_sched = NULL;
1935         *sd->output_queue_tailp = q;
1936         sd->output_queue_tailp = &q->next_sched;
1937         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1938         local_irq_restore(flags);
1939 }
1940
1941 void __netif_schedule(struct Qdisc *q)
1942 {
1943         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1944                 __netif_reschedule(q);
1945 }
1946 EXPORT_SYMBOL(__netif_schedule);
1947
1948 void dev_kfree_skb_irq(struct sk_buff *skb)
1949 {
1950         if (atomic_dec_and_test(&skb->users)) {
1951                 struct softnet_data *sd;
1952                 unsigned long flags;
1953
1954                 local_irq_save(flags);
1955                 sd = &__get_cpu_var(softnet_data);
1956                 skb->next = sd->completion_queue;
1957                 sd->completion_queue = skb;
1958                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1959                 local_irq_restore(flags);
1960         }
1961 }
1962 EXPORT_SYMBOL(dev_kfree_skb_irq);
1963
/*
 * Free @skb from any context: dev_kfree_skb_irq() is used in hard-irq
 * context or with interrupts disabled, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (!in_irq() && !irqs_disabled()) {
                dev_kfree_skb(skb);
                return;
        }

        dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1972
1973
1974 /**
1975  * netif_device_detach - mark device as removed
1976  * @dev: network device
1977  *
1978  * Mark device as removed from system and therefore no longer available.
1979  */
1980 void netif_device_detach(struct net_device *dev)
1981 {
1982         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1983             netif_running(dev)) {
1984                 netif_tx_stop_all_queues(dev);
1985         }
1986 }
1987 EXPORT_SYMBOL(netif_device_detach);
1988
1989 /**
1990  * netif_device_attach - mark device as attached
1991  * @dev: network device
1992  *
1993  * Mark device as attached from system and restart if needed.
1994  */
1995 void netif_device_attach(struct net_device *dev)
1996 {
1997         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1998             netif_running(dev)) {
1999                 netif_tx_wake_all_queues(dev);
2000                 __netdev_watchdog_up(dev);
2001         }
2002 }
2003 EXPORT_SYMBOL(netif_device_attach);
2004
2005 static void skb_warn_bad_offload(const struct sk_buff *skb)
2006 {
2007         static const netdev_features_t null_features = 0;
2008         struct net_device *dev = skb->dev;
2009         const char *driver = "";
2010
2011         if (dev && dev->dev.parent)
2012                 driver = dev_driver_string(dev->dev.parent);
2013
2014         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2015              "gso_type=%d ip_summed=%d\n",
2016              driver, dev ? &dev->features : &null_features,
2017              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2018              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2019              skb_shinfo(skb)->gso_type, skb->ip_summed);
2020 }
2021
2022 /*
2023  * Invalidate hardware checksum when packet is to be mangled, and
2024  * complete checksum manually on outgoing path.
2025  */
2026 int skb_checksum_help(struct sk_buff *skb)
2027 {
2028         __wsum csum;
2029         int ret = 0, offset;
2030
2031         if (skb->ip_summed == CHECKSUM_COMPLETE)
2032                 goto out_set_summed;
2033
2034         if (unlikely(skb_shinfo(skb)->gso_size)) {
2035                 skb_warn_bad_offload(skb);
2036                 return -EINVAL;
2037         }
2038
2039         offset = skb_checksum_start_offset(skb);
2040         BUG_ON(offset >= skb_headlen(skb));
2041         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2042
2043         offset += skb->csum_offset;
2044         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2045
2046         if (skb_cloned(skb) &&
2047             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2048                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2049                 if (ret)
2050                         goto out;
2051         }
2052
2053         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2054 out_set_summed:
2055         skb->ip_summed = CHECKSUM_NONE;
2056 out:
2057         return ret;
2058 }
2059 EXPORT_SYMBOL(skb_checksum_help);
2060
2061 /**
2062  *      skb_gso_segment - Perform segmentation on skb.
2063  *      @skb: buffer to segment
2064  *      @features: features for the output path (see dev->features)
2065  *
2066  *      This function segments the given skb and returns a list of segments.
2067  *
2068  *      It may return NULL if the skb requires no segmentation.  This is
2069  *      only possible when GSO is used for verifying header integrity.
2070  */
2071 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
2072         netdev_features_t features)
2073 {
2074         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2075         struct packet_offload *ptype;
2076         __be16 type = skb->protocol;
2077         int vlan_depth = ETH_HLEN;
2078         int err;
2079
2080         while (type == htons(ETH_P_8021Q)) {
2081                 struct vlan_hdr *vh;
2082
2083                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2084                         return ERR_PTR(-EINVAL);
2085
2086                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2087                 type = vh->h_vlan_encapsulated_proto;
2088                 vlan_depth += VLAN_HLEN;
2089         }
2090
2091         skb_reset_mac_header(skb);
2092         skb->mac_len = skb->network_header - skb->mac_header;
2093         __skb_pull(skb, skb->mac_len);
2094
2095         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2096                 skb_warn_bad_offload(skb);
2097
2098                 if (skb_header_cloned(skb) &&
2099                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2100                         return ERR_PTR(err);
2101         }
2102
2103         rcu_read_lock();
2104         list_for_each_entry_rcu(ptype, &offload_base, list) {
2105                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2106                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2107                                 err = ptype->callbacks.gso_send_check(skb);
2108                                 segs = ERR_PTR(err);
2109                                 if (err || skb_gso_ok(skb, features))
2110                                         break;
2111                                 __skb_push(skb, (skb->data -
2112                                                  skb_network_header(skb)));
2113                         }
2114                         segs = ptype->callbacks.gso_segment(skb, features);
2115                         break;
2116                 }
2117         }
2118         rcu_read_unlock();
2119
2120         __skb_push(skb, skb->data - skb_mac_header(skb));
2121
2122         return segs;
2123 }
2124 EXPORT_SYMBOL(skb_gso_segment);
2125
2126 /* Take action when hardware reception checksum errors are detected. */
2127 #ifdef CONFIG_BUG
2128 void netdev_rx_csum_fault(struct net_device *dev)
2129 {
2130         if (net_ratelimit()) {
2131                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2132                 dump_stack();
2133         }
2134 }
2135 EXPORT_SYMBOL(netdev_rx_csum_fault);
2136 #endif
2137
/* Actually, this check should be eliminated as soon as we know that:
 * 1. an IOMMU is present and allows mapping all of the memory, or
 * 2. no high memory really exists on this machine.
 *
 * Returns 1 when a page fragment of @skb is one that @dev must not be
 * given for DMA, 0 otherwise (always 0 without CONFIG_HIGHMEM).
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        struct device *parent;
        int nr;

        /* Without NETIF_F_HIGHDMA no fragment may sit in high memory. */
        if (!(dev->features & NETIF_F_HIGHDMA)) {
                for (nr = 0; nr < skb_shinfo(skb)->nr_frags; nr++) {
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[nr];

                        if (PageHighMem(skb_frag_page(frag)))
                                return 1;
                }
        }

        if (!(PCI_DMA_BUS_IS_PHYS))
                return 0;

        parent = dev->dev.parent;
        if (!parent)
                return 0;

        /* Every fragment page must end within the parent's DMA mask. */
        for (nr = 0; nr < skb_shinfo(skb)->nr_frags; nr++) {
                skb_frag_t *frag = &skb_shinfo(skb)->frags[nr];
                dma_addr_t addr = page_to_phys(skb_frag_page(frag));

                if (!parent->dma_mask ||
                    addr + PAGE_SIZE - 1 > *parent->dma_mask)
                        return 1;
        }
#endif
        return 0;
}
2170
2171 struct dev_gso_cb {
2172         void (*destructor)(struct sk_buff *skb);
2173 };
2174
2175 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2176
2177 static void dev_gso_skb_destructor(struct sk_buff *skb)
2178 {
2179         struct dev_gso_cb *cb;
2180
2181         do {
2182                 struct sk_buff *nskb = skb->next;
2183
2184                 skb->next = nskb->next;
2185                 nskb->next = NULL;
2186                 kfree_skb(nskb);
2187         } while (skb->next);
2188
2189         cb = DEV_GSO_CB(skb);
2190         if (cb->destructor)
2191                 cb->destructor(skb);
2192 }
2193
2194 /**
2195  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2196  *      @skb: buffer to segment
2197  *      @features: device features as applicable to this skb
2198  *
2199  *      This function segments the given skb and stores the list of segments
2200  *      in skb->next.
2201  */
2202 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2203 {
2204         struct sk_buff *segs;
2205
2206         segs = skb_gso_segment(skb, features);
2207
2208         /* Verifying header integrity only. */
2209         if (!segs)
2210                 return 0;
2211
2212         if (IS_ERR(segs))
2213                 return PTR_ERR(segs);
2214
2215         skb->next = segs;
2216         DEV_GSO_CB(skb)->destructor = skb->destructor;
2217         skb->destructor = dev_gso_skb_destructor;
2218
2219         return 0;
2220 }
2221
2222 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2223 {
2224         return ((features & NETIF_F_GEN_CSUM) ||
2225                 ((features & NETIF_F_V4_CSUM) &&
2226                  protocol == htons(ETH_P_IP)) ||
2227                 ((features & NETIF_F_V6_CSUM) &&
2228                  protocol == htons(ETH_P_IPV6)) ||
2229                 ((features & NETIF_F_FCOE_CRC) &&
2230                  protocol == htons(ETH_P_FCOE)));
2231 }
2232
2233 static netdev_features_t harmonize_features(struct sk_buff *skb,
2234         __be16 protocol, netdev_features_t features)
2235 {
2236         if (skb->ip_summed != CHECKSUM_NONE &&
2237             !can_checksum_protocol(features, protocol)) {
2238                 features &= ~NETIF_F_ALL_CSUM;
2239                 features &= ~NETIF_F_SG;
2240         } else if (illegal_highdma(skb->dev, skb)) {
2241                 features &= ~NETIF_F_SG;
2242         }
2243
2244         return features;
2245 }
2246
2247 netdev_features_t netif_skb_features(struct sk_buff *skb)
2248 {
2249         __be16 protocol = skb->protocol;
2250         netdev_features_t features = skb->dev->features;
2251
2252         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2253                 features &= ~NETIF_F_GSO_MASK;
2254
2255         if (protocol == htons(ETH_P_8021Q)) {
2256                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2257                 protocol = veh->h_vlan_encapsulated_proto;
2258         } else if (!vlan_tx_tag_present(skb)) {
2259                 return harmonize_features(skb, protocol, features);
2260         }
2261
2262         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2263
2264         if (protocol != htons(ETH_P_8021Q)) {
2265                 return harmonize_features(skb, protocol, features);
2266         } else {
2267                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2268                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2269                 return harmonize_features(skb, protocol, features);
2270         }
2271 }
2272 EXPORT_SYMBOL(netif_skb_features);
2273
2274 /*
2275  * Returns true if either:
2276  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2277  *      2. skb is fragmented and the device does not support SG.
2278  */
2279 static inline int skb_needs_linearize(struct sk_buff *skb,
2280                                       int features)
2281 {
2282         return skb_is_nonlinear(skb) &&
2283                         ((skb_has_frag_list(skb) &&
2284                                 !(features & NETIF_F_FRAGLIST)) ||
2285                         (skb_shinfo(skb)->nr_frags &&
2286                                 !(features & NETIF_F_SG)));
2287 }
2288
2289 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2290                         struct netdev_queue *txq)
2291 {
2292         const struct net_device_ops *ops = dev->netdev_ops;
2293         int rc = NETDEV_TX_OK;
2294         unsigned int skb_len;
2295
2296         if (likely(!skb->next)) {
2297                 netdev_features_t features;
2298
2299                 /*
2300                  * If device doesn't need skb->dst, release it right now while
2301                  * its hot in this cpu cache
2302                  */
2303                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2304                         skb_dst_drop(skb);
2305
2306                 features = netif_skb_features(skb);
2307
2308                 if (vlan_tx_tag_present(skb) &&
2309                     !(features & NETIF_F_HW_VLAN_TX)) {
2310                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2311                         if (unlikely(!skb))
2312                                 goto out;
2313
2314                         skb->vlan_tci = 0;
2315                 }
2316
2317                 if (netif_needs_gso(skb, features)) {
2318                         if (unlikely(dev_gso_segment(skb, features)))
2319                                 goto out_kfree_skb;
2320                         if (skb->next)
2321                                 goto gso;
2322                 } else {
2323                         if (skb_needs_linearize(skb, features) &&
2324                             __skb_linearize(skb))
2325                                 goto out_kfree_skb;
2326
2327                         /* If packet is not checksummed and device does not
2328                          * support checksumming for this protocol, complete
2329                          * checksumming here.
2330                          */
2331                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2332                                 skb_set_transport_header(skb,
2333                                         skb_checksum_start_offset(skb));
2334                                 if (!(features & NETIF_F_ALL_CSUM) &&
2335                                      skb_checksum_help(skb))
2336                                         goto out_kfree_skb;
2337                         }
2338                 }
2339
2340                 if (!list_empty(&ptype_all))
2341                         dev_queue_xmit_nit(skb, dev);
2342
2343                 skb_len = skb->len;
2344                 rc = ops->ndo_start_xmit(skb, dev);
2345                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2346                 if (rc == NETDEV_TX_OK)
2347                         txq_trans_update(txq);
2348                 return rc;
2349         }
2350
2351 gso:
2352         do {
2353                 struct sk_buff *nskb = skb->next;
2354
2355                 skb->next = nskb->next;
2356                 nskb->next = NULL;
2357
2358                 /*
2359                  * If device doesn't need nskb->dst, release it right now while
2360                  * its hot in this cpu cache
2361                  */
2362                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2363                         skb_dst_drop(nskb);
2364
2365                 if (!list_empty(&ptype_all))
2366                         dev_queue_xmit_nit(nskb, dev);
2367
2368                 skb_len = nskb->len;
2369                 rc = ops->ndo_start_xmit(nskb, dev);
2370                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2371                 if (unlikely(rc != NETDEV_TX_OK)) {
2372                         if (rc & ~NETDEV_TX_MASK)
2373                                 goto out_kfree_gso_skb;
2374                         nskb->next = skb->next;
2375                         skb->next = nskb;
2376                         return rc;
2377                 }
2378                 txq_trans_update(txq);
2379                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2380                         return NETDEV_TX_BUSY;
2381         } while (skb->next);
2382
2383 out_kfree_gso_skb:
2384         if (likely(skb->next == NULL))
2385                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2386 out_kfree_skb:
2387         kfree_skb(skb);
2388 out:
2389         return rc;
2390 }
2391
2392 static u32 hashrnd __read_mostly;
2393
2394 /*
2395  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2396  * to be used as a distribution range.
2397  */
2398 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2399                   unsigned int num_tx_queues)
2400 {
2401         u32 hash;
2402         u16 qoffset = 0;
2403         u16 qcount = num_tx_queues;
2404
2405         if (skb_rx_queue_recorded(skb)) {
2406                 hash = skb_get_rx_queue(skb);
2407                 while (unlikely(hash >= num_tx_queues))
2408                         hash -= num_tx_queues;
2409                 return hash;
2410         }
2411
2412         if (dev->num_tc) {
2413                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2414                 qoffset = dev->tc_to_txq[tc].offset;
2415                 qcount = dev->tc_to_txq[tc].count;
2416         }
2417
2418         if (skb->sk && skb->sk->sk_hash)
2419                 hash = skb->sk->sk_hash;
2420         else
2421                 hash = (__force u16) skb->protocol;
2422         hash = jhash_1word(hash, hashrnd);
2423
2424         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2425 }
2426 EXPORT_SYMBOL(__skb_tx_hash);
2427
2428 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2429 {
2430         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2431                 net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2432                                      dev->name, queue_index,
2433                                      dev->real_num_tx_queues);
2434                 return 0;
2435         }
2436         return queue_index;
2437 }
2438
/*
 * Look up a Tx queue for @skb in the XPS map of the current CPU.
 * Returns the queue index, or -1 when there is no usable map entry, when
 * the chosen queue is beyond real_num_tx_queues, or when CONFIG_XPS is
 * not set.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
        struct xps_dev_maps *maps;
        struct xps_map *cpu_map;
        int txq = -1;

        rcu_read_lock();

        maps = rcu_dereference(dev->xps_maps);
        if (!maps)
                goto unlock;

        cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
        if (!cpu_map)
                goto unlock;

        if (cpu_map->len == 1) {
                txq = cpu_map->queues[0];
        } else {
                u32 hash;

                /* Several candidates: spread over them by hash. */
                if (skb->sk && skb->sk->sk_hash)
                        hash = skb->sk->sk_hash;
                else
                        hash = (__force u16) skb->protocol ^
                            skb->rxhash;
                hash = jhash_1word(hash, hashrnd);
                txq = cpu_map->queues[((u64)hash * cpu_map->len) >> 32];
        }

        if (unlikely(txq >= dev->real_num_tx_queues))
                txq = -1;

unlock:
        rcu_read_unlock();

        return txq;
#else
        return -1;
#endif
}
2476
2477 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
2478                                     struct sk_buff *skb)
2479 {
2480         int queue_index;
2481         const struct net_device_ops *ops = dev->netdev_ops;
2482
2483         if (dev->real_num_tx_queues == 1)
2484                 queue_index = 0;
2485         else if (ops->ndo_select_queue) {
2486                 queue_index = ops->ndo_select_queue(dev, skb);
2487                 queue_index = dev_cap_txqueue(dev, queue_index);
2488         } else {
2489                 struct sock *sk = skb->sk;
2490                 queue_index = sk_tx_queue_get(sk);
2491
2492                 if (queue_index < 0 || skb->ooo_okay ||
2493                     queue_index >= dev->real_num_tx_queues) {
2494                         int old_index = queue_index;
2495
2496                         queue_index = get_xps_queue(dev, skb);
2497                         if (queue_index < 0)
2498                                 queue_index = skb_tx_hash(dev, skb);
2499
2500                         if (queue_index != old_index && sk) {
2501                                 struct dst_entry *dst =
2502                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2503
2504                                 if (dst && skb_dst(skb) == dst)
2505                                         sk_tx_queue_set(sk, queue_index);
2506                         }
2507                 }
2508         }
2509
2510         skb_set_queue_mapping(skb, queue_index);
2511         return netdev_get_tx_queue(dev, queue_index);
2512 }
2513
     /*
      * __dev_xmit_skb - hand @skb to qdisc @q, transmitting directly when possible
      * @skb: buffer to send
      * @q: queueing discipline the buffer is given to
      * @dev: device the buffer is sent on
      * @txq: transmit queue passed on to sch_direct_xmit()
      *
      * The enqueue/bypass decision is made with the qdisc root lock held.
      * Returns NET_XMIT_DROP when the qdisc is deactivated (the skb is freed),
      * NET_XMIT_SUCCESS on the direct-transmit bypass path, and otherwise the
      * q->enqueue() result masked with NET_XMIT_MASK.
      */
2514 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2515                                  struct net_device *dev,
2516                                  struct netdev_queue *txq)
2517 {
2518         spinlock_t *root_lock = qdisc_lock(q);
2519         bool contended;
2520         int rc;
2521
2522         qdisc_skb_cb(skb)->pkt_len = skb->len;
2523         qdisc_calculate_pkt_len(skb, q);
2524         /*
2525          * Heuristic to force contended enqueues to serialize on a
2526          * separate lock before trying to get qdisc main lock.
2527          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2528          * and dequeue packets faster.
2529          */
2530         contended = qdisc_is_running(q);
2531         if (unlikely(contended))
2532                 spin_lock(&q->busylock);
2533
2534         spin_lock(root_lock);
2535         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2536                 kfree_skb(skb);
2537                 rc = NET_XMIT_DROP;
2538         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2539                    qdisc_run_begin(q)) {
2540                 /*
2541                  * This is a work-conserving queue; there are no old skbs
2542                  * waiting to be sent out; and the qdisc is not running -
2543                  * xmit the skb directly.
2544                  */
2545                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2546                         skb_dst_force(skb);
2547
2548                 qdisc_bstats_update(q, skb);
2549
2550                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
                             /* Drop busylock before running the qdisc; clearing
                              * 'contended' prevents a second unlock at the end.
                              */
2551                         if (unlikely(contended)) {
2552                                 spin_unlock(&q->busylock);
2553                                 contended = false;
2554                         }
2555                         __qdisc_run(q);
2556                 } else
2557                         qdisc_run_end(q);
2558
2559                 rc = NET_XMIT_SUCCESS;
2560         } else {
                     /* Regular path: enqueue, then run the qdisc if nobody else is. */
2561                 skb_dst_force(skb);
2562                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2563                 if (qdisc_run_begin(q)) {
2564                         if (unlikely(contended)) {
2565                                 spin_unlock(&q->busylock);
2566                                 contended = false;
2567                         }
2568                         __qdisc_run(q);
2569                 }
2570         }
2571         spin_unlock(root_lock);
             /* busylock is still held here only if no branch above released it. */
2572         if (unlikely(contended))
2573                 spin_unlock(&q->busylock);
2574         return rc;
2575 }
2576
2577 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
     /*
      * skb_update_prio - derive skb->priority from the device's priomap
      *
      * Only acts when the skb has no priority yet, has a socket attached and
      * the device has a priomap.  The socket's sk_cgrp_prioidx is used as the
      * index; indexes at or beyond priomap_len leave the skb untouched.
      * Expands to nothing when CONFIG_NETPRIO_CGROUP is not enabled.
      */
2578 static void skb_update_prio(struct sk_buff *skb)
2579 {
2580         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2581
2582         if (!skb->priority && skb->sk && map) {
2583                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2584
2585                 if (prioidx < map->priomap_len)
2586                         skb->priority = map->priomap[prioidx];
2587         }
2588 }
2589 #else
2590 #define skb_update_prio(skb)
2591 #endif
2592
     /*
      * Per-CPU counter incremented around dev_hard_start_xmit() on the
      * queue-less path of dev_queue_xmit(); once it exceeds RECURSION_LIMIT
      * that path refuses to transmit and reports a dead loop.
      */
2593 static DEFINE_PER_CPU(int, xmit_recursion);
2594 #define RECURSION_LIMIT 10
2595
2596 /**
2597  *      dev_loopback_xmit - loop back @skb
2598  *      @skb: buffer to transmit
      *
      *      Resets the MAC header, pulls the data pointer up to the network
      *      header, marks the buffer PACKET_LOOPBACK / CHECKSUM_UNNECESSARY
      *      and feeds it back into the receive path through netif_rx_ni().
      *      Warns if the buffer carries no dst entry.  Always returns 0; the
      *      netif_rx_ni() result is not propagated.
2599  */
2600 int dev_loopback_xmit(struct sk_buff *skb)
2601 {
2602         skb_reset_mac_header(skb);
2603         __skb_pull(skb, skb_network_offset(skb));
2604         skb->pkt_type = PACKET_LOOPBACK;
2605         skb->ip_summed = CHECKSUM_UNNECESSARY;
2606         WARN_ON(!skb_dst(skb));
2607         skb_dst_force(skb);
2608         netif_rx_ni(skb);
2609         return 0;
2610 }
2611 EXPORT_SYMBOL(dev_loopback_xmit);
2612
2613 /**
2614  *      dev_queue_xmit - transmit a buffer
2615  *      @skb: buffer to transmit
2616  *
2617  *      Queue a buffer for transmission to a network device. The caller must
2618  *      have set the device and priority and built the buffer before calling
2619  *      this function. The function can be called from an interrupt.
2620  *
2621  *      A negative errno code is returned on a failure. A success does not
2622  *      guarantee the frame will be transmitted as it may be dropped due
2623  *      to congestion or traffic shaping.
2624  *
2625  * -----------------------------------------------------------------------------------
2626  *      I notice this method can also return errors from the queue disciplines,
2627  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2628  *      be positive.
2629  *
2630  *      Regardless of the return value, the skb is consumed, so it is currently
2631  *      difficult to retry a send to this method.  (You can bump the ref count
2632  *      before sending to hold a reference for retry if you are careful.)
2633  *
2634  *      When calling this method, interrupts MUST be enabled.  This is because
2635  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2636  *          --BLG
      *
      *      Return paths, as implemented below:
      *       - the qdisc has an enqueue hook: the __dev_xmit_skb() result;
      *       - queue-less device that is up: the dev_hard_start_xmit() result
      *         when dev_xmit_complete() accepts it;
      *       - everything else (device down, queue stopped, recursion limit
      *         hit, lock already owned by this CPU): the skb is freed and
      *         -ENETDOWN is returned.
2637  */
2638 int dev_queue_xmit(struct sk_buff *skb)
2639 {
2640         struct net_device *dev = skb->dev;
2641         struct netdev_queue *txq;
2642         struct Qdisc *q;
2643         int rc = -ENOMEM; /* overwritten on every return path below */
2644
2645         /* Disable soft irqs for various locks below. Also
2646          * stops preemption for RCU.
2647          */
2648         rcu_read_lock_bh();
2649
2650         skb_update_prio(skb);
2651
2652         txq = netdev_pick_tx(dev, skb);
2653         q = rcu_dereference_bh(txq->qdisc);
2654
2655 #ifdef CONFIG_NET_CLS_ACT
2656         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2657 #endif
2658         trace_net_dev_queue(skb);
2659         if (q->enqueue) {
2660                 rc = __dev_xmit_skb(skb, q, dev, txq);
2661                 goto out;
2662         }
2663
2664         /* The device has no queue. Common case for software devices:
2665            loopback, all the sorts of tunnels...
2666
2667            Really, it is unlikely that netif_tx_lock protection is necessary
2668            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2669            counters.)
2670            However, it is possible, that they rely on protection
2671            made by us here.
2672
2673            Check this and shot the lock. It is not prone from deadlocks.
2674            Either shot noqueue qdisc, it is even simpler 8)
2675          */
2676         if (dev->flags & IFF_UP) {
2677                 int cpu = smp_processor_id(); /* ok because BHs are off */
2678
                     /* This CPU already owning the tx lock means we re-entered
                      * the transmit path for the same queue.
                      */
2679                 if (txq->xmit_lock_owner != cpu) {
2680
2681                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2682                                 goto recursion_alert;
2683
2684                         HARD_TX_LOCK(dev, txq, cpu);
2685
2686                         if (!netif_xmit_stopped(txq)) {
2687                                 __this_cpu_inc(xmit_recursion);
2688                                 rc = dev_hard_start_xmit(skb, dev, txq);
2689                                 __this_cpu_dec(xmit_recursion);
2690                                 if (dev_xmit_complete(rc)) {
2691                                         HARD_TX_UNLOCK(dev, txq);
2692                                         goto out;
2693                                 }
2694                         }
2695                         HARD_TX_UNLOCK(dev, txq);
2696                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2697                                              dev->name);
2698                 } else {
2699                         /* Recursion is detected! It is possible,
2700                          * unfortunately
2701                          */
2702 recursion_alert:
2703                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2704                                              dev->name);
2705                 }
2706         }
2707
             /* Failure path: nothing took ownership of the skb, so free it here. */
2708         rc = -ENETDOWN;
2709         rcu_read_unlock_bh();
2710
2711         kfree_skb(skb);
2712         return rc;
2713 out:
2714         rcu_read_unlock_bh();
2715         return rc;
2716 }
2717 EXPORT_SYMBOL(dev_queue_xmit);
2718
2719
2720 /*=======================================================================
2721                         Receiver routines
2722   =======================================================================*/
2723
     /* Limit the input_pkt_queue length is compared against in enqueue_to_backlog() */
2724 int netdev_max_backlog __read_mostly = 1000;
2725 EXPORT_SYMBOL(netdev_max_backlog);
2726
     /* Passed as the condition to net_timestamp_check() in netif_rx() */
2727 int netdev_tstamp_prequeue __read_mostly = 1;
     /* NOTE(review): not referenced in this part of the file; presumably the
      * per-softirq receive budget - confirm against its user.
      */
2728 int netdev_budget __read_mostly = 300;
2729 int weight_p __read_mostly = 64;            /* old backlog weight */
2730
2731 /* Called with irq disabled.
      * Appends @napi to the tail of @sd's poll_list and raises NET_RX_SOFTIRQ.
      */
2732 static inline void ____napi_schedule(struct softnet_data *sd,
2733                                      struct napi_struct *napi)
2734 {
2735         list_add_tail(&napi->poll_list, &sd->poll_list);
2736         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2737 }
2738
2739 /*
2740  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2741  * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2742  * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2743  * if hash is a canonical 4-tuple hash over transport ports.
      *
      * When skb_flow_dissect() reports failure the function returns early
      * and leaves skb->rxhash untouched.
2744  */
2745 void __skb_get_rxhash(struct sk_buff *skb)
2746 {
2747         struct flow_keys keys;
2748         u32 hash;
2749
2750         if (!skb_flow_dissect(skb, &keys))
2751                 return;
2752
2753         if (keys.ports)
2754                 skb->l4_rxhash = 1;
2755
2756         /* get a consistent hash (same value on both flow directions) */
2757         if (((__force u32)keys.dst < (__force u32)keys.src) ||
2758             (((__force u32)keys.dst == (__force u32)keys.src) &&
2759              ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
2760                 swap(keys.dst, keys.src);
2761                 swap(keys.port16[0], keys.port16[1]);
2762         }
2763
2764         hash = jhash_3words((__force u32)keys.dst,
2765                             (__force u32)keys.src,
2766                             (__force u32)keys.ports, hashrnd);
             /* Zero is reserved for "no valid hash", so remap it to 1. */
2767         if (!hash)
2768                 hash = 1;
2769
2770         skb->rxhash = hash;
2771 }
2772 EXPORT_SYMBOL(__skb_get_rxhash);
2773
2774 #ifdef CONFIG_RPS
2775
2776 /* One global table that all flow-based protocols share. */
2777 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2778 EXPORT_SYMBOL(rps_sock_flow_table);
2779
     /* Static key tested by netif_rx() to decide whether to take the RPS path. */
2780 struct static_key rps_needed __read_mostly;
2781
     /*
      * set_rps_cpu - record @next_cpu as the target CPU of a flow entry
      * @dev: receiving device
      * @skb: packet belonging to the flow
      * @rflow: flow entry currently used for the packet
      * @next_cpu: CPU to steer the flow to, or RPS_NO_CPU
      *
      * With CONFIG_RFS_ACCEL, and when the device supports it, the driver is
      * additionally asked through ndo_rx_flow_steer() to steer the flow to the
      * RX queue that cpu_rmap_lookup_index() returns for @next_cpu; on success
      * the entry in that queue's flow table replaces @rflow.
      *
      * Returns the flow entry that was finally updated.
      */
2782 static struct rps_dev_flow *
2783 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2784             struct rps_dev_flow *rflow, u16 next_cpu)
2785 {
2786         if (next_cpu != RPS_NO_CPU) {
2787 #ifdef CONFIG_RFS_ACCEL
2788                 struct netdev_rx_queue *rxqueue;
2789                 struct rps_dev_flow_table *flow_table;
2790                 struct rps_dev_flow *old_rflow;
2791                 u32 flow_id;
2792                 u16 rxq_index;
2793                 int rc;
2794
2795                 /* Should we steer this flow to a different hardware queue? */
2796                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2797                     !(dev->features & NETIF_F_NTUPLE))
2798                         goto out;
2799                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2800                 if (rxq_index == skb_get_rx_queue(skb))
2801                         goto out;
2802
2803                 rxqueue = dev->_rx + rxq_index;
2804                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2805                 if (!flow_table)
2806                         goto out;
2807                 flow_id = skb->rxhash & flow_table->mask;
2808                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2809                                                         rxq_index, flow_id);
2810                 if (rc < 0)
2811                         goto out;
                     /* A non-negative return value is stored as the filter id. */
2812                 old_rflow = rflow;
2813                 rflow = &flow_table->flows[flow_id];
2814                 rflow->filter = rc;
2815                 if (old_rflow->filter == rflow->filter)
2816                         old_rflow->filter = RPS_NO_FILTER;
2817         out:
2818 #endif
2819                 rflow->last_qtail =
2820                         per_cpu(softnet_data, next_cpu).input_queue_head;
2821         }
2822
2823         rflow->cpu = next_cpu;
2824         return rflow;
2825 }
2826
2827 /*
2828  * get_rps_cpu is called from netif_receive_skb and returns the target
2829  * CPU from the RPS map of the receiving queue for a given skb.
2830  * rcu_read_lock must be held on entry.
      *
      * Returns an online CPU number, or -1 when no target was selected.
      * *rflowp is written only when the CPU comes from the flow tables;
      * on every other path it is left as the caller initialised it.
2831  */
2832 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2833                        struct rps_dev_flow **rflowp)
2834 {
2835         struct netdev_rx_queue *rxqueue;
2836         struct rps_map *map;
2837         struct rps_dev_flow_table *flow_table;
2838         struct rps_sock_flow_table *sock_flow_table;
2839         int cpu = -1;
2840         u16 tcpu;
2841
2842         if (skb_rx_queue_recorded(skb)) {
2843                 u16 index = skb_get_rx_queue(skb);
2844                 if (unlikely(index >= dev->real_num_rx_queues)) {
2845                         WARN_ONCE(dev->real_num_rx_queues > 1,
2846                                   "%s received packet on queue %u, but number "
2847                                   "of RX queues is %u\n",
2848                                   dev->name, index, dev->real_num_rx_queues);
2849                         goto done;
2850                 }
2851                 rxqueue = dev->_rx + index;
2852         } else
2853                 rxqueue = dev->_rx;
2854
             /* Shortcuts that avoid computing the hash: a single-CPU map with no
              * flow table, or neither a map nor a flow table.
              */
2855         map = rcu_dereference(rxqueue->rps_map);
2856         if (map) {
2857                 if (map->len == 1 &&
2858                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2859                         tcpu = map->cpus[0];
2860                         if (cpu_online(tcpu))
2861                                 cpu = tcpu;
2862                         goto done;
2863                 }
2864         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2865                 goto done;
2866         }
2867
2868         skb_reset_network_header(skb);
2869         if (!skb_get_rxhash(skb))
2870                 goto done;
2871
2872         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2873         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2874         if (flow_table && sock_flow_table) {
2875                 u16 next_cpu;
2876                 struct rps_dev_flow *rflow;
2877
2878                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2879                 tcpu = rflow->cpu;
2880
2881                 next_cpu = sock_flow_table->ents[skb->rxhash &
2882                     sock_flow_table->mask];
2883
2884                 /*
2885                  * If the desired CPU (where last recvmsg was done) is
2886                  * different from current CPU (one in the rx-queue flow
2887                  * table entry), switch if one of the following holds:
2888                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2889                  *   - Current CPU is offline.
2890                  *   - The current CPU's queue tail has advanced beyond the
2891                  *     last packet that was enqueued using this table entry.
2892                  *     This guarantees that all previous packets for the flow
2893                  *     have been dequeued, thus preserving in order delivery.
2894                  */
2895                 if (unlikely(tcpu != next_cpu) &&
2896                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2897                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2898                       rflow->last_qtail)) >= 0)) {
2899                         tcpu = next_cpu;
2900                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2901                 }
2902
2903                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2904                         *rflowp = rflow;
2905                         cpu = tcpu;
2906                         goto done;
2907                 }
2908         }
2909
             /* Fall back to the RPS map: scale the 32-bit hash into [0, map->len). */
2910         if (map) {
2911                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2912
2913                 if (cpu_online(tcpu)) {
2914                         cpu = tcpu;
2915                         goto done;
2916                 }
2917         }
2918
2919 done:
2920         return cpu;
2921 }
2922
2923 #ifdef CONFIG_RFS_ACCEL
2924
2925 /**
2926  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2927  * @dev: Device on which the filter was set
2928  * @rxq_index: RX queue index
2929  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2930  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2931  *
2932  * Drivers that implement ndo_rx_flow_steer() should periodically call
2933  * this function for each installed filter and remove the filters for
2934  * which it returns %true.
      *
      * Returns %false only when the flow entry still carries @filter_id, has
      * a CPU assigned, and that CPU's input_queue_head is less than
      * 10 * flow_table->mask ahead of the entry's last_qtail.  Returns %true
      * in every other case, including a missing flow table or a @flow_id
      * beyond the table mask.
2935  */
2936 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2937                          u32 flow_id, u16 filter_id)
2938 {
2939         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2940         struct rps_dev_flow_table *flow_table;
2941         struct rps_dev_flow *rflow;
2942         bool expire = true;
2943         int cpu;
2944
2945         rcu_read_lock();
2946         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2947         if (flow_table && flow_id <= flow_table->mask) {
2948                 rflow = &flow_table->flows[flow_id];
                     /* Read cpu once so the check and the per_cpu() lookup agree. */
2949                 cpu = ACCESS_ONCE(rflow->cpu);
2950                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2951                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2952                            rflow->last_qtail) <
2953                      (int)(10 * flow_table->mask)))
2954                         expire = false;
2955         }
2956         rcu_read_unlock();
2957         return expire;
2958 }
2959 EXPORT_SYMBOL(rps_may_expire_flow);
2960
2961 #endif /* CONFIG_RFS_ACCEL */
2962
2963 /* Called from hardirq (IPI) context.
      * @data is the softnet_data whose backlog NAPI gets scheduled; the
      * received_rps counter of that structure is incremented afterwards.
      */
2964 static void rps_trigger_softirq(void *data)
2965 {
2966         struct softnet_data *sd = data;
2967
2968         ____napi_schedule(sd, &sd->backlog);
2969         sd->received_rps++;
2970 }
2971
2972 #endif /* CONFIG_RPS */
2973
2974 /*
2975  * Check whether this softnet_data structure belongs to another CPU.
2976  * If yes, queue it on our IPI list, raise NET_RX_SOFTIRQ and return 1.
2977  * If no (or when CONFIG_RPS is disabled), return 0.
2978  */
2979 static int rps_ipi_queued(struct softnet_data *sd)
2980 {
2981 #ifdef CONFIG_RPS
2982         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2983
2984         if (sd != mysd) {
                     /* Push sd onto the head of this CPU's singly linked IPI list. */
2985                 sd->rps_ipi_next = mysd->rps_ipi_list;
2986                 mysd->rps_ipi_list = sd;
2987
2988                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2989                 return 1;
2990         }
2991 #endif /* CONFIG_RPS */
2992         return 0;
2993 }
2994
2995 /*
2996  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2997  * queue (may be a remote CPU queue).
      *
      * @qtail is handed to input_queue_tail_incr_save() when the skb is
      * queued.  Returns NET_RX_SUCCESS when the skb was queued.  When the
      * queue is longer than netdev_max_backlog the skb is freed, the
      * per-CPU and per-device drop counters are bumped and NET_RX_DROP is
      * returned.  Local interrupts are disabled while the queue is touched.
2998  */
2999 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3000                               unsigned int *qtail)
3001 {
3002         struct softnet_data *sd;
3003         unsigned long flags;
3004
3005         sd = &per_cpu(softnet_data, cpu);
3006
3007         local_irq_save(flags);
3008
3009         rps_lock(sd);
3010         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
                     /* Non-empty queue: skip the NAPI scheduling step below. */
3011                 if (skb_queue_len(&sd->input_pkt_queue)) {
3012 enqueue:
3013                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3014                         input_queue_tail_incr_save(sd, qtail);
3015                         rps_unlock(sd);
3016                         local_irq_restore(flags);
3017                         return NET_RX_SUCCESS;
3018                 }
3019
3020                 /* Schedule NAPI for backlog device
3021                  * We can use non atomic operation since we own the queue lock
3022                  */
3023                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3024                         if (!rps_ipi_queued(sd))
3025                                 ____napi_schedule(sd, &sd->backlog);
3026                 }
3027                 goto enqueue;
3028         }
3029
3030         sd->dropped++;
3031         rps_unlock(sd);
3032
3033         local_irq_restore(flags);
3034
3035         atomic_long_inc(&skb->dev->rx_dropped);
3036         kfree_skb(skb);
3037         return NET_RX_DROP;
3038 }
3039
3040 /**
3041  *      netif_rx        -       post buffer to the network code
3042  *      @skb: buffer to post
3043  *
3044  *      This function receives a packet from a device driver and queues it for
3045  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3046  *      may be dropped during processing for congestion control or by the
3047  *      protocol layers.
3048  *
3049  *      return values:
3050  *      NET_RX_SUCCESS  (no congestion)
3051  *      NET_RX_DROP     (packet was dropped)
      *
      *      NET_RX_DROP is also returned, without queueing, when netpoll_rx()
      *      claims the buffer.
3052  *
3053  */
3054
3055 int netif_rx(struct sk_buff *skb)
3056 {
3057         int ret;
3058
3059         /* if netpoll wants it, pretend we never saw it */
3060         if (netpoll_rx(skb))
3061                 return NET_RX_DROP;
3062
3063         net_timestamp_check(netdev_tstamp_prequeue, skb);
3064
3065         trace_netif_rx(skb);
3066 #ifdef CONFIG_RPS
3067         if (static_key_false(&rps_needed)) {
                     /* voidflow is a scratch entry: it keeps &rflow->last_qtail
                      * valid when get_rps_cpu() leaves rflow untouched.
                      */
3068                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3069                 int cpu;
3070
3071                 preempt_disable();
3072                 rcu_read_lock();
3073
3074                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
                     /* No RPS target selected: queue on the current CPU. */
3075                 if (cpu < 0)
3076                         cpu = smp_processor_id();
3077
3078                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3079
3080                 rcu_read_unlock();
3081                 preempt_enable();
3082         } else
3083 #endif
3084         {
3085                 unsigned int qtail;
3086                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3087                 put_cpu();
3088         }
3089         return ret;
3090 }
3091 EXPORT_SYMBOL(netif_rx);
3092
     /*
      * netif_rx_ni - netif_rx() followed by processing of pending softirqs
      * @skb: buffer to post
      *
      * Calls netif_rx() with preemption disabled and, if a softirq is pending
      * afterwards, runs do_softirq() before re-enabling preemption.
      * Returns the netif_rx() result.
      */
3093 int netif_rx_ni(struct sk_buff *skb)
3094 {
3095         int err;
3096
3097         preempt_disable();
3098         err = netif_rx(skb);
3099         if (local_softirq_pending())
3100                 do_softirq();
3101         preempt_enable();
3102
3103         return err;
3104 }
3105 EXPORT_SYMBOL(netif_rx_ni);
3106
     /*
      * net_tx_action - softirq handler working on this CPU's softnet_data
      * @h: softirq action descriptor (unused)
      *
      * Does two independent things:
      *  1. detaches the completion_queue and frees every skb on it;
      *  2. detaches the output_queue and runs each qdisc on it, or puts the
      *     qdisc back through __netif_reschedule() when its root lock is busy.
      * Both lists are detached with local interrupts disabled and then
      * processed with interrupts enabled.
      */
3107 static void net_tx_action(struct softirq_action *h)
3108 {
3109         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3110
3111         if (sd->completion_queue) {
3112                 struct sk_buff *clist;
3113
3114                 local_irq_disable();
3115                 clist = sd->completion_queue;
3116                 sd->completion_queue = NULL;
3117                 local_irq_enable();
3118
3119                 while (clist) {
3120                         struct sk_buff *skb = clist;
3121                         clist = clist->next;
3122
                             /* Warn if a queued skb still has a non-zero user count. */
3123                         WARN_ON(atomic_read(&skb->users));
3124                         trace_kfree_skb(skb, net_tx_action);
3125                         __kfree_skb(skb);
3126                 }
3127         }
3128
3129         if (sd->output_queue) {
3130                 struct Qdisc *head;
3131
3132                 local_irq_disable();
3133                 head = sd->output_queue;
3134                 sd->output_queue = NULL;
3135                 sd->output_queue_tailp = &sd->output_queue;
3136                 local_irq_enable();
3137
3138                 while (head) {
3139                         struct Qdisc *q = head;
3140                         spinlock_t *root_lock;
3141
3142                         head = head->next_sched;
3143
3144                         root_lock = qdisc_lock(q);
3145                         if (spin_trylock(root_lock)) {
3146                                 smp_mb__before_clear_bit();
3147                                 clear_bit(__QDISC_STATE_SCHED,
3148                                           &q->state);
3149                                 qdisc_run(q);
3150                                 spin_unlock(root_lock);
3151                         } else {
                                     /* Lock busy: retry later unless the qdisc was
                                      * deactivated, in which case only the SCHED
                                      * bit is cleared.
                                      */
3152                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3153                                               &q->state)) {
3154                                         __netif_reschedule(q);
3155                                 } else {
3156                                         smp_mb__before_clear_bit();
3157                                         clear_bit(__QDISC_STATE_SCHED,
3158                                                   &q->state);
3159                                 }
3160                         }
3161                 }
3162         }
3163 }
3164
3165 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3166     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3167 /* This hook is defined here for ATM LANE.
      * It is only built when both bridge and ATM LANE support are configured
      * (built-in or modular) and is exported GPL-only.
      */
3168 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3169                              unsigned char *addr) __read_mostly;
3170 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3171 #endif
3172
3173 #ifdef CONFIG_NET_CLS_ACT
3174 /* TODO: Maybe we should just force sch_ingress to be compiled in
3175  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3176  * a compare and 2 stores extra right now if we dont have it on
3177  * but have CONFIG_NET_CLS_ACT
3178  * NOTE: This doesn't stop any functionality; if you dont have
3179  * the ingress scheduler, you just can't add policies on ingress.
3180  *
      * ing_filter() runs @skb through the qdisc attached to @rxq.  It returns
      * TC_ACT_SHOT when the redirect counter kept in tc_verd already exceeds
      * MAX_RED_LOOP, the qdisc_enqueue_root() verdict when an active qdisc
      * other than noop_qdisc is attached, and TC_ACT_OK otherwise.
3181  */
3182 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3183 {
3184         struct net_device *dev = skb->dev;
3185         u32 ttl = G_TC_RTTL(skb->tc_verd);
3186         int result = TC_ACT_OK;
3187         struct Qdisc *q;
3188
3189         if (unlikely(MAX_RED_LOOP < ttl++)) {
3190                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3191                                      skb->skb_iif, dev->ifindex);
3192                 return TC_ACT_SHOT;
3193         }
3194
             /* Store the incremented counter and mark the skb as seen at ingress. */
3195         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3196         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3197
3198         q = rxq->qdisc;
3199         if (q != &noop_qdisc) {
3200                 spin_lock(qdisc_lock(q));
3201                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3202                         result = qdisc_enqueue_root(skb, q);
3203                 spin_unlock(qdisc_lock(q));
3204         }
3205
3206         return result;
3207 }
3208
     /*
      * handle_ing - apply the ingress qdisc, if any, to @skb
      * @skb: received buffer
      * @pt_prev: pending packet handler; delivered to and reset to NULL before
      *           the filter runs
      * @ret: receives the deliver_skb() result when @pt_prev was pending
      * @orig_dev: device passed on to deliver_skb()
      *
      * Returns NULL when the filter verdict is TC_ACT_SHOT or TC_ACT_STOLEN
      * (the skb is freed here).  Otherwise returns @skb with tc_verd cleared,
      * which is also what happens when the device has no ingress queue or
      * only noop_qdisc attached.
      */
3209 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3210                                          struct packet_type **pt_prev,
3211                                          int *ret, struct net_device *orig_dev)
3212 {
3213         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3214
3215         if (!rxq || rxq->qdisc == &noop_qdisc)
3216                 goto out;
3217
3218         if (*pt_prev) {
3219                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3220                 *pt_prev = NULL;
3221         }
3222
3223         switch (ing_filter(skb, rxq)) {
3224         case TC_ACT_SHOT:
3225         case TC_ACT_STOLEN:
3226                 kfree_skb(skb);
3227                 return NULL;
3228         }
3229
3230 out:
3231         skb->tc_verd = 0;
3232         return skb;
3233 }
3234 #endif
3235
3236 /**
3237  *      netdev_rx_handler_register - register receive handler
3238  *      @dev: device to register a handler for
3239  *      @rx_handler: receive handler to register
3240  *      @rx_handler_data: data pointer that is used by rx handler
3241  *
3242  *      Register a receive handler for a device. This handler will then be
3243  *      called from __netif_receive_skb. A negative errno code is returned
3244  *      on a failure.
      *
      *      Only one handler per device is supported: -EBUSY is returned when
      *      a handler is already registered, 0 on success.
3245  *
3246  *      The caller must hold the rtnl_mutex.
3247  *
3248  *      For a general description of rx_handler, see enum rx_handler_result.
3249  */
3250 int netdev_rx_handler_register(struct net_device *dev,
3251                                rx_handler_func_t *rx_handler,
3252                                void *rx_handler_data)
3253 {
3254         ASSERT_RTNL();
3255
3256         if (dev->rx_handler)
3257                 return -EBUSY;
3258
             /* Note: rx_handler_data must be set before rx_handler */
3259         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3260         rcu_assign_pointer(dev->rx_handler, rx_handler);
3261
3262         return 0;
3263 }
3264 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3265
3266 /**
3267  *      netdev_rx_handler_unregister - unregister receive handler
3268  *      @dev: device to unregister a handler from
3269  *
3270  *      Unregister a receive handler from a device.
      *
      *      The handler pointer is cleared first and the handler data only
      *      after synchronize_net() has returned, so a receive path that
      *      already picked up the handler under rcu_read_lock() never sees
      *      NULL handler data.  May sleep.
3271  *
3272  *      The caller must hold the rtnl_mutex.
3273  */
3274 void netdev_rx_handler_unregister(struct net_device *dev)
3275 {
3276
3277         ASSERT_RTNL();
3278         RCU_INIT_POINTER(dev->rx_handler, NULL);
             /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
              * section has a guarantee to see a non NULL rx_handler_data
              * as well.
              */
             synchronize_net();
3279         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3280 }
3281 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3282
3283 /*
3284  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3285  * the special handling of PFMEMALLOC skbs.
      *
      * Returns true when skb->protocol is ARP, IPv4, IPv6 or 802.1Q, false
      * for every other protocol.
3286  */
3287 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3288 {
3289         switch (skb->protocol) {
3290         case __constant_htons(ETH_P_ARP):
3291         case __constant_htons(ETH_P_IP):
3292         case __constant_htons(ETH_P_IPV6):
3293         case __constant_htons(ETH_P_8021Q):
3294                 return true;
3295         default:
3296                 return false;
3297         }
3298 }
3299
3300 static int __netif_receive_skb(struct sk_buff *skb)
3301 {
3302         struct packet_type *ptype, *pt_prev;
3303         rx_handler_func_t *rx_handler;
3304         struct net_device *orig_dev;
3305         struct net_device *null_or_dev;
3306         bool deliver_exact = false;
3307         int ret = NET_RX_DROP;
3308         __be16 type;
3309         unsigned long pflags = current->flags;
3310
3311         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3312
3313         trace_netif_receive_skb(skb);
3314
3315         /*
3316          * PFMEMALLOC skbs are special, they should
3317          * - be delivered to SOCK_MEMALLOC sockets only
3318          * - stay away from userspace
3319          * - have bounded memory usage
3320          *
3321          * Use PF_MEMALLOC as this saves us from propagating the allocation
3322          * context down to all allocation sites.
3323          */
3324         if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3325                 current->flags |= PF_MEMALLOC;
3326
3327         /* if we've gotten here through NAPI, check netpoll */
3328         if (netpoll_receive_skb(skb))
3329                 goto out;
3330
3331         orig_dev = skb->dev;
3332
3333         skb_reset_network_header(skb);
3334         skb_reset_transport_header(skb);
3335         skb_reset_mac_len(skb);
3336
3337         pt_prev = NULL;
3338
3339         rcu_read_lock();
3340
3341 another_round:
3342         skb->skb_iif = skb->dev->ifindex;
3343
3344         __this_cpu_inc(softnet_data.processed);
3345
3346         if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3347                 skb = vlan_untag(skb);
3348                 if (unlikely(!skb))
3349                         goto unlock;
3350         }
3351
3352 #ifdef CONFIG_NET_CLS_ACT
3353         if (skb->tc_verd & TC_NCLS) {
3354                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3355                 goto ncls;
3356         }
3357 #endif
3358
3359         if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3360                 goto skip_taps;
3361
3362         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3363                 if (!ptype->dev || ptype->dev == skb->dev) {
3364                         if (pt_prev)
3365                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3366                         pt_prev = ptype;
3367                 }
3368         }
3369
3370 skip_taps:
3371 #ifdef CONFIG_NET_CLS_ACT
3372         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3373         if (!skb)
3374                 goto unlock;
3375 ncls:
3376 #endif
3377
3378         if (sk_memalloc_socks() && skb_pfmemalloc(skb)
3379                                 && !skb_pfmemalloc_protocol(skb))
3380                 goto drop;
3381
3382         if (vlan_tx_tag_present(skb)) {
3383                 if (pt_prev) {
3384                         ret = deliver_skb(skb, pt_prev, orig_dev);
3385                         pt_prev = NULL;
3386                 }
3387                 if (vlan_do_receive(&skb))
3388                         goto another_round;
3389                 else if (unlikely(!skb))
3390                         goto unlock;
3391         }
3392
3393         rx_handler = rcu_dereference(skb->dev->rx_handler);
3394         if (rx_handler) {
3395                 if (pt_prev) {
3396                         ret = deliver_skb(skb, pt_prev, orig_dev);
3397                         pt_prev = NULL;
3398                 }
3399                 switch (rx_handler(&skb)) {
3400                 case RX_HANDLER_CONSUMED:
3401                         goto unlock;
3402                 case RX_HANDLER_ANOTHER:
3403                         goto another_round;
3404                 case RX_HANDLER_EXACT:
3405                         deliver_exact = true;
3406                 case RX_HANDLER_PASS:
3407                         break;
3408                 default:
3409                         BUG();
3410                 }
3411         }
3412
3413         if (vlan_tx_nonzero_tag_present(skb))
3414                 skb->pkt_type = PACKET_OTHERHOST;
3415
3416         /* deliver only exact match when indicated */
3417         null_or_dev = deliver_exact ? skb->dev : NULL;
3418
3419         type = skb->protocol;
3420         list_for_each_entry_rcu(ptype,
3421                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3422                 if (ptype->type == type &&
3423                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3424                      ptype->dev == orig_dev)) {
3425                         if (pt_prev)
3426                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3427                         pt_prev = ptype;
3428                 }
3429         }
3430
3431         if (pt_prev) {
3432                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3433                         goto drop;
3434                 else
3435                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3436         } else {
3437 drop:
3438                 atomic_long_inc(&skb->dev->rx_dropped);
3439                 kfree_skb(skb);
3440                 /* Jamal, now you will not able to escape explaining
3441                  * me how you were going to use this. :-)
3442                  */
3443                 ret = NET_RX_DROP;
3444         }
3445
3446 unlock:
3447         rcu_read_unlock();
3448 out:
3449         tsk_restore_flags(current, pflags, PF_MEMALLOC);
3450         return ret;
3451 }
3452
3453 /**
3454  *      netif_receive_skb - process receive buffer from network
3455  *      @skb: buffer to process
3456  *
3457  *      netif_receive_skb() is the main receive data processing function.
3458  *      It always succeeds. The buffer may be dropped during processing
3459  *      for congestion control or by the protocol layers.
3460  *
3461  *      This function may only be called from softirq context and interrupts
3462  *      should be enabled.
3463  *
3464  *      Return values (usually ignored):
3465  *      NET_RX_SUCCESS: no congestion
3466  *      NET_RX_DROP: packet was dropped
3467  */
3468 int netif_receive_skb(struct sk_buff *skb)
3469 {
3470         net_timestamp_check(netdev_tstamp_prequeue, skb);
3471
3472         if (skb_defer_rx_timestamp(skb))
3473                 return NET_RX_SUCCESS;
3474
3475 #ifdef CONFIG_RPS
3476         if (static_key_false(&rps_needed)) {
3477                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3478                 int cpu, ret;
3479
3480                 rcu_read_lock();
3481
3482                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3483
3484                 if (cpu >= 0) {
3485                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3486                         rcu_read_unlock();
3487                         return ret;
3488                 }
3489                 rcu_read_unlock();
3490         }
3491 #endif
3492         return __netif_receive_skb(skb);
3493 }
3494 EXPORT_SYMBOL(netif_receive_skb);
3495
3496 /* Network device is going away, flush any packets still pending
3497  * Called with irqs disabled.
3498  */
3499 static void flush_backlog(void *arg)
3500 {
3501         struct net_device *dev = arg;
3502         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3503         struct sk_buff *skb, *tmp;
3504
3505         rps_lock(sd);
3506         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3507                 if (skb->dev == dev) {
3508                         __skb_unlink(skb, &sd->input_pkt_queue);
3509                         kfree_skb(skb);
3510                         input_queue_head_incr(sd);
3511                 }
3512         }
3513         rps_unlock(sd);
3514
3515         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3516                 if (skb->dev == dev) {
3517                         __skb_unlink(skb, &sd->process_queue);
3518                         kfree_skb(skb);
3519                         input_queue_head_incr(sd);
3520                 }
3521         }
3522 }
3523
3524 static int napi_gro_complete(struct sk_buff *skb)
3525 {
3526         struct packet_offload *ptype;
3527         __be16 type = skb->protocol;
3528         struct list_head *head = &offload_base;
3529         int err = -ENOENT;
3530
3531         if (NAPI_GRO_CB(skb)->count == 1) {
3532                 skb_shinfo(skb)->gso_size = 0;
3533                 goto out;
3534         }
3535
3536         rcu_read_lock();
3537         list_for_each_entry_rcu(ptype, head, list) {
3538                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3539                         continue;
3540
3541                 err = ptype->callbacks.gro_complete(skb);
3542                 break;
3543         }
3544         rcu_read_unlock();
3545
3546         if (err) {
3547                 WARN_ON(&ptype->list == head);
3548                 kfree_skb(skb);
3549                 return NET_RX_SUCCESS;
3550         }
3551
3552 out:
3553         return netif_receive_skb(skb);
3554 }
3555
3556 /* napi->gro_list contains packets ordered by age.
3557  * youngest packets at the head of it.
3558  * Complete skbs in reverse order to reduce latencies.
3559  */
3560 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3561 {
3562         struct sk_buff *skb, *prev = NULL;
3563
3564         /* scan list and build reverse chain */
3565         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3566                 skb->prev = prev;
3567                 prev = skb;
3568         }
3569
3570         for (skb = prev; skb; skb = prev) {
3571                 skb->next = NULL;
3572
3573                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3574                         return;
3575
3576                 prev = skb->prev;
3577                 napi_gro_complete(skb);
3578                 napi->gro_count--;
3579         }
3580
3581         napi->gro_list = NULL;
3582 }
3583 EXPORT_SYMBOL(napi_gro_flush);
3584
/*
 * dev_gro_receive - offer a received buffer to the GRO offload handlers.
 *
 * Returns GRO_NORMAL when the buffer is not to be held: NETIF_F_GRO is not
 * set on the device, netpoll_rx_on() is true, the buffer is GSO or has a
 * frag_list, no packet_offload with a gro_receive callback matches
 * skb->protocol, the handler set ->flush, or MAX_GRO_SKBS buffers are
 * already held.  Returns GRO_MERGED or GRO_MERGED_FREE (depending on
 * ->free) when the handler marked the buffer as same_flow, and GRO_HELD
 * when the buffer was put at the head of napi->gro_list.
 *
 * On the GRO_NORMAL and GRO_HELD paths any header bytes located past the
 * linear area (up to the GRO offset) are copied in from frag0.
 */
3585 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3586 {
3587         struct sk_buff **pp = NULL;
3588         struct packet_offload *ptype;
3589         __be16 type = skb->protocol;
3590         struct list_head *head = &offload_base;
3591         int same_flow;
3592         int mac_len;
3593         enum gro_result ret;
3594
3595         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3596                 goto normal;
3597
3598         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3599                 goto normal;
3600
             /* Find the first offload handler for this protocol, under RCU. */
3601         rcu_read_lock();
3602         list_for_each_entry_rcu(ptype, head, list) {
3603                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3604                         continue;
3605
3606                 skb_set_network_header(skb, skb_gro_offset(skb));
3607                 mac_len = skb->network_header - skb->mac_header;
3608                 skb->mac_len = mac_len;
3609                 NAPI_GRO_CB(skb)->same_flow = 0;
3610                 NAPI_GRO_CB(skb)->flush = 0;
3611                 NAPI_GRO_CB(skb)->free = 0;
3612
3613                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3614                 break;
3615         }
3616         rcu_read_unlock();
3617
             /* The walk reached the list head again: no handler matched. */
3618         if (&ptype->list == head)
3619                 goto normal;
3620
3621         same_flow = NAPI_GRO_CB(skb)->same_flow;
3622         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3623
             /* The handler pointed at a held buffer: unlink and complete it. */
3624         if (pp) {
3625                 struct sk_buff *nskb = *pp;
3626
3627                 *pp = nskb->next;
3628                 nskb->next = NULL;
3629                 napi_gro_complete(nskb);
3630                 napi->gro_count--;
3631         }
3632
3633         if (same_flow)
3634                 goto ok;
3635
3636         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3637                 goto normal;
3638
             /* Hold the buffer: it becomes the new head of napi->gro_list. */
3639         napi->gro_count++;
3640         NAPI_GRO_CB(skb)->count = 1;
3641         NAPI_GRO_CB(skb)->age = jiffies;
3642         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3643         skb->next = napi->gro_list;
3644         napi->gro_list = skb;
3645         ret = GRO_HELD;
3646
3647 pull:
             /* The GRO offset lies past the linear data: copy the missing
              * bytes from frag0 into the tail and trim them off frags[0].
              */
3648         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3649                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3650
3651                 BUG_ON(skb->end - skb->tail < grow);
3652
3653                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3654
3655                 skb->tail += grow;
3656                 skb->data_len -= grow;
3657
3658                 skb_shinfo(skb)->frags[0].page_offset += grow;
3659                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3660
                     /* frags[0] is now empty: drop it and shift the rest down. */
3661                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3662                         skb_frag_unref(skb, 0);
3663                         memmove(skb_shinfo(skb)->frags,
3664                                 skb_shinfo(skb)->frags + 1,
3665                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3666                 }
3667         }
3668
3669 ok:
3670         return ret;
3671
3672 normal:
3673         ret = GRO_NORMAL;
3674         goto pull;
3675 }
3676 EXPORT_SYMBOL(dev_gro_receive);
3677
3678 static inline gro_result_t
3679 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3680 {
3681         struct sk_buff *p;
3682         unsigned int maclen = skb->dev->hard_header_len;
3683
3684         for (p = napi->gro_list; p; p = p->next) {
3685                 unsigned long diffs;
3686
3687                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3688                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3689                 if (maclen == ETH_HLEN)
3690                         diffs |= compare_ether_header(skb_mac_header(p),
3691                                                       skb_gro_mac_header(skb));
3692                 else if (!diffs)
3693                         diffs = memcmp(skb_mac_header(p),
3694                                        skb_gro_mac_header(skb),
3695                                        maclen);
3696                 NAPI_GRO_CB(p)->same_flow = !diffs;
3697                 NAPI_GRO_CB(p)->flush = 0;
3698         }
3699
3700         return dev_gro_receive(napi, skb);
3701 }
3702
3703 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3704 {
3705         switch (ret) {
3706         case GRO_NORMAL:
3707                 if (netif_receive_skb(skb))
3708                         ret = GRO_DROP;
3709                 break;
3710
3711         case GRO_DROP:
3712                 kfree_skb(skb);
3713                 break;
3714
3715         case GRO_MERGED_FREE:
3716                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3717                         kmem_cache_free(skbuff_head_cache, skb);
3718                 else
3719                         __kfree_skb(skb);
3720                 break;
3721
3722         case GRO_HELD:
3723         case GRO_MERGED:
3724                 break;
3725         }
3726
3727         return ret;
3728 }
3729 EXPORT_SYMBOL(napi_skb_finish);
3730
3731 static void skb_gro_reset_offset(struct sk_buff *skb)
3732 {
3733         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3734         const skb_frag_t *frag0 = &pinfo->frags[0];
3735
3736         NAPI_GRO_CB(skb)->data_offset = 0;
3737         NAPI_GRO_CB(skb)->frag0 = NULL;
3738         NAPI_GRO_CB(skb)->frag0_len = 0;
3739
3740         if (skb->mac_header == skb->tail &&
3741             pinfo->nr_frags &&
3742             !PageHighMem(skb_frag_page(frag0))) {
3743                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3744                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3745         }
3746 }
3747
3748 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3749 {
3750         skb_gro_reset_offset(skb);
3751
3752         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3753 }
3754 EXPORT_SYMBOL(napi_gro_receive);
3755
3756 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3757 {
3758         __skb_pull(skb, skb_headlen(skb));
3759         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3760         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3761         skb->vlan_tci = 0;
3762         skb->dev = napi->dev;
3763         skb->skb_iif = 0;
3764
3765         napi->skb = skb;
3766 }
3767
3768 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3769 {
3770         struct sk_buff *skb = napi->skb;
3771
3772         if (!skb) {
3773                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3774                 if (skb)
3775                         napi->skb = skb;
3776         }
3777         return skb;
3778 }
3779 EXPORT_SYMBOL(napi_get_frags);
3780
3781 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3782                                gro_result_t ret)
3783 {
3784         switch (ret) {
3785         case GRO_NORMAL:
3786         case GRO_HELD:
3787                 skb->protocol = eth_type_trans(skb, skb->dev);
3788
3789                 if (ret == GRO_HELD)
3790                         skb_gro_pull(skb, -ETH_HLEN);
3791                 else if (netif_receive_skb(skb))
3792                         ret = GRO_DROP;
3793                 break;
3794
3795         case GRO_DROP:
3796         case GRO_MERGED_FREE:
3797                 napi_reuse_skb(napi, skb);
3798                 break;
3799
3800         case GRO_MERGED:
3801                 break;
3802         }
3803
3804         return ret;
3805 }
3806 EXPORT_SYMBOL(napi_frags_finish);
3807
3808 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3809 {
3810         struct sk_buff *skb = napi->skb;
3811         struct ethhdr *eth;
3812         unsigned int hlen;
3813         unsigned int off;
3814
3815         napi->skb = NULL;
3816
3817         skb_reset_mac_header(skb);
3818         skb_gro_reset_offset(skb);
3819
3820         off = skb_gro_offset(skb);
3821         hlen = off + sizeof(*eth);
3822         eth = skb_gro_header_fast(skb, off);
3823         if (skb_gro_header_hard(skb, hlen)) {
3824                 eth = skb_gro_header_slow(skb, hlen, off);
3825                 if (unlikely(!eth)) {
3826                         napi_reuse_skb(napi, skb);
3827                         skb = NULL;
3828                         goto out;
3829                 }
3830         }
3831
3832         skb_gro_pull(skb, sizeof(*eth));
3833
3834         /*
3835          * This works because the only protocols we care about don't require
3836          * special handling.  We'll fix it up properly at the end.
3837          */
3838         skb->protocol = eth->h_proto;
3839
3840 out:
3841         return skb;
3842 }
3843
3844 gro_result_t napi_gro_frags(struct napi_struct *napi)
3845 {
3846         struct sk_buff *skb = napi_frags_skb(napi);
3847
3848         if (!skb)
3849                 return GRO_DROP;
3850
3851         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3852 }
3853 EXPORT_SYMBOL(napi_gro_frags);
3854
3855 /*
3856  * net_rps_action sends any pending IPI's for rps.
3857  * Note: called with local irq disabled, but exits with local irq enabled.
      *
      * With CONFIG_RPS the list hanging off sd->rps_ipi_list is detached
      * while interrupts are still off, interrupts are enabled, and every
      * online CPU on the list is then kicked through its csd.  Without
      * CONFIG_RPS, or when the list is empty, the function only enables
      * interrupts.
3858  */
3859 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3860 {
3861 #ifdef CONFIG_RPS
3862         struct softnet_data *remsd = sd->rps_ipi_list;
3863
3864         if (remsd) {
3865                 sd->rps_ipi_list = NULL;
3866
3867                 local_irq_enable();
3868
3869                 /* Send pending IPI's to kick RPS processing on remote cpus. */
3870                 while (remsd) {
                             /* Read the link before this entry is kicked. */
3871                         struct softnet_data *next = remsd->rps_ipi_next;
3872
3873                         if (cpu_online(remsd->cpu))
3874                                 __smp_call_function_single(remsd->cpu,
3875                                                            &remsd->csd, 0);
3876                         remsd = next;
3877                 }
3878         } else
3879 #endif
                     /* Body of the "else" above, or the only statement
                      * when CONFIG_RPS is not set.
                      */
3880                 local_irq_enable();
3881 }
3882
/*
 * process_backlog - drain the backlog owned by the softnet_data that
 * embeds @napi.
 *
 * Passes up to @quota buffers to __netif_receive_skb(), taking them from
 * sd->process_queue and refilling that queue from sd->input_pkt_queue
 * under rps_lock().  When input_pkt_queue held fewer buffers than the
 * remaining quota, @napi is removed from its poll list and its state is
 * cleared.  Returns the number of buffers processed.
 */
3883 static int process_backlog(struct napi_struct *napi, int quota)
3884 {
3885         int work = 0;
3886         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3887
3888 #ifdef CONFIG_RPS
3889         /* Check if we have pending IPIs; it's better to send them now
3890          * than to wait for the end of net_rx_action().
3891          */
3892         if (sd->rps_ipi_list) {
3893                 local_irq_disable();
3894                 net_rps_action_and_irq_enable(sd);
3895         }
3896 #endif
3897         napi->weight = weight_p;
3898         local_irq_disable();
3899         while (work < quota) {
3900                 struct sk_buff *skb;
3901                 unsigned int qlen;
3902
                     /* Interrupts are on only while a buffer is processed. */
3903                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3904                         local_irq_enable();
3905                         __netif_receive_skb(skb);
3906                         local_irq_disable();
3907                         input_queue_head_incr(sd);
3908                         if (++work >= quota) {
3909                                 local_irq_enable();
3910                                 return work;
3911                         }
3912                 }
3913
                     /* process_queue is empty: move over everything queued
                      * on input_pkt_queue.
                      */
3914                 rps_lock(sd);
3915                 qlen = skb_queue_len(&sd->input_pkt_queue);
3916                 if (qlen)
3917                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3918                                                    &sd->process_queue);
3919
3920                 if (qlen < quota - work) {
3921                         /*
3922                          * Inline a custom version of __napi_complete().
3923                          * only current cpu owns and manipulates this napi,
3924                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3925                          * we can use a plain write instead of clear_bit(),
3926                          * and we dont need an smp_mb() memory barrier.
3927                          */
3928                         list_del(&napi->poll_list);
3929                         napi->state = 0;
3930
                             /* Cap the loop at what was just spliced over. */
3931                         quota = work + qlen;
3932                 }
3933                 rps_unlock(sd);
3934         }
3935         local_irq_enable();
3936
3937         return work;
3938 }
3939
3940 /**
3941  * __napi_schedule - schedule for receive
3942  * @n: entry to schedule
3943  *
3944  * The entry's receive function will be scheduled to run
3945  */
3946 void __napi_schedule(struct napi_struct *n)
3947 {
3948         unsigned long flags;
3949
3950         local_irq_save(flags);
3951         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3952         local_irq_restore(flags);
3953 }
3954 EXPORT_SYMBOL(__napi_schedule);
3955
/*
 * __napi_complete - take @n off its poll list and clear NAPI_STATE_SCHED.
 *
 * BUGs if @n does not have NAPI_STATE_SCHED set or still has buffers on
 * its gro_list.
 */
3956 void __napi_complete(struct napi_struct *n)
3957 {
3958         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3959         BUG_ON(n->gro_list);
3960
3961         list_del(&n->poll_list);
             /* The barrier is issued before the SCHED bit is cleared. */
3962         smp_mb__before_clear_bit();
3963         clear_bit(NAPI_STATE_SCHED, &n->state);
3964 }
3965 EXPORT_SYMBOL(__napi_complete);
3966
3967 void napi_complete(struct napi_struct *n)
3968 {
3969         unsigned long flags;
3970
3971         /*
3972          * don't let napi dequeue from the cpu poll list
3973          * just in case its running on a different cpu
3974          */
3975         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3976                 return;
3977
3978         napi_gro_flush(n, false);
3979         local_irq_save(flags);
3980         __napi_complete(n);
3981         local_irq_restore(flags);
3982 }
3983 EXPORT_SYMBOL(napi_complete);
3984
3985 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3986                     int (*poll)(struct napi_struct *, int), int weight)
3987 {
3988         INIT_LIST_HEAD(&napi->poll_list);
3989         napi->gro_count = 0;
3990         napi->gro_list = NULL;
3991         napi->skb = NULL;
3992         napi->poll = poll;
3993         napi->weight = weight;
3994         list_add(&napi->dev_list, &dev->napi_list);
3995         napi->dev = dev;
3996 #ifdef CONFIG_NETPOLL
3997         spin_lock_init(&napi->poll_lock);
3998         napi->poll_owner = -1;
3999 #endif
4000         set_bit(NAPI_STATE_SCHED, &napi->state);
4001 }
4002 EXPORT_SYMBOL(netif_napi_add);
4003
4004 void netif_napi_del(struct napi_struct *napi)
4005 {
4006         struct sk_buff *skb, *next;
4007
4008         list_del_init(&napi->dev_list);
4009         napi_free_frags(napi);
4010
4011         for (skb = napi->gro_list; skb; skb = next) {
4012                 next = skb->next;
4013                 skb->next = NULL;
4014                 kfree_skb(skb);
4015         }
4016
4017         napi->gro_list = NULL;
4018         napi->gro_count = 0;
4019 }
4020 EXPORT_SYMBOL(netif_napi_del);
4021
/*
 * net_rx_action - run ->poll() for the NAPI instances queued on this CPU's
 * sd->poll_list.
 *
 * Polling continues until the list is empty, the budget (netdev_budget) is
 * used up, or jiffies has passed the start value plus 2.  In the last two
 * cases sd->time_squeeze is bumped and NET_RX_SOFTIRQ is raised again.
 * Pending RPS IPIs are sent on the way out, which also re-enables
 * interrupts.
 */
4022 static void net_rx_action(struct softirq_action *h)
4023 {
4024         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4025         unsigned long time_limit = jiffies + 2;
4026         int budget = netdev_budget;
4027         void *have;
4028
4029         local_irq_disable();
4030
4031         while (!list_empty(&sd->poll_list)) {
4032                 struct napi_struct *n;
4033                 int work, weight;
4034
4035                 /* If softirq window is exhausted then punt.
4036                  * Allow this to run for 2 jiffies, which will allow
4037                  * an average latency of 1.5/HZ.
4038                  */
4039                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
4040                         goto softnet_break;
4041
4042                 local_irq_enable();
4043
4044                 /* Even though interrupts have been re-enabled, this
4045                  * access is safe because interrupts can only add new
4046                  * entries to the tail of this list, and only ->poll()
4047                  * calls can remove this head entry from the list.
4048                  */
4049                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4050
4051                 have = netpoll_poll_lock(n);
4052
4053                 weight = n->weight;
4054
4055                 /* This NAPI_STATE_SCHED test is for avoiding a race
4056                  * with netpoll's poll_napi().  Only the entity which
4057                  * obtains the lock and sees NAPI_STATE_SCHED set will
4058                  * actually make the ->poll() call.  Therefore we avoid
4059                  * accidentally calling ->poll() when NAPI is not scheduled.
4060                  */
4061                 work = 0;
4062                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4063                         work = n->poll(n, weight);
4064                         trace_napi_poll(n);
4065                 }
4066
4067                 WARN_ON_ONCE(work > weight);
4068
4069                 budget -= work;
4070
4071                 local_irq_disable();
4072
4073                 /* Drivers must not modify the NAPI state if they
4074                  * consume the entire weight.  In such cases this code
4075                  * still "owns" the NAPI instance and therefore can
4076                  * move the instance around on the list at-will.
4077                  */
4078                 if (unlikely(work == weight)) {
4079                         if (unlikely(napi_disable_pending(n))) {
4080                                 local_irq_enable();
4081                                 napi_complete(n);
4082                                 local_irq_disable();
4083                         } else {
4084                                 if (n->gro_list) {
4085                                         /* flush too old packets
4086                                          * If HZ < 1000, flush all packets.
4087                                          */
4088                                         local_irq_enable();
4089                                         napi_gro_flush(n, HZ >= 1000);
4090                                         local_irq_disable();
4091                                 }
                                     /* Requeue the instance at the tail so the
                                      * next entry gets polled first.
                                      */
4092                                 list_move_tail(&n->poll_list, &sd->poll_list);
4093                         }
4094                 }
4095
4096                 netpoll_poll_unlock(have);
4097         }
4098 out:
             /* Interrupts are off here; this call turns them back on. */
4099         net_rps_action_and_irq_enable(sd);
4100
4101 #ifdef CONFIG_NET_DMA
4102         /*
4103          * There may not be any more sk_buffs coming right now, so push
4104          * any pending DMA copies to hardware
4105          */
4106         dma_issue_pending_all();
4107 #endif
4108
4109         return;
4110
4111 softnet_break:
4112         sd->time_squeeze++;
4113         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4114         goto out;
4115 }
4116
4117 static gifconf_func_t *gifconf_list[NPROTO];
4118
4119 /**
4120  *      register_gifconf        -       register a SIOCGIF handler
4121  *      @family: Address family
4122  *      @gifconf: Function handler
4123  *
4124  *      Register protocol dependent address dumping routines. The handler
4125  *      that is passed must not be freed or reused until it has been replaced
4126  *      by another handler.
4127  */
4128 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4129 {
4130         if (family >= NPROTO)
4131                 return -EINVAL;
4132         gifconf_list[family] = gifconf;
4133         return 0;
4134 }
4135 EXPORT_SYMBOL(register_gifconf);
4136
4137
4138 /*
4139  *      Map an interface index to its name (SIOCGIFNAME)
4140  */
4141
4142 /*
4143  *      We need this ioctl for efficient implementation of the
4144  *      if_indextoname() function required by the IPv6 API.  Without
4145  *      it, we would have to search all the interfaces to find a
4146  *      match.  --pb
4147  */
4148
4149 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4150 {
4151         struct net_device *dev;
4152         struct ifreq ifr;
4153
4154         /*
4155          *      Fetch the caller's info block.
4156          */
4157
4158         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4159                 return -EFAULT;
4160
4161         rcu_read_lock();
4162         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4163         if (!dev) {
4164                 rcu_read_unlock();
4165                 return -ENODEV;
4166         }
4167
4168         strcpy(ifr.ifr_name, dev->name);
4169         rcu_read_unlock();
4170
4171         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4172                 return -EFAULT;
4173         return 0;
4174 }
4175
4176 /*
4177  *      Perform a SIOCGIFCONF call. This structure will change
4178  *      size eventually, and there is nothing I can do about it.
4179  *      Thus we will need a 'compatibility mode'.
4180  */
4181
4182 static int dev_ifconf(struct net *net, char __user *arg)
4183 {
4184         struct ifconf ifc;
4185         struct net_device *dev;
4186         char __user *pos;
4187         int len;
4188         int total;
4189         int i;
4190
4191         /*
4192          *      Fetch the caller's info block.
4193          */
4194
4195         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4196                 return -EFAULT;
4197
4198         pos = ifc.ifc_buf;
4199         len = ifc.ifc_len;
4200
4201         /*
4202          *      Loop over the interfaces, and write an info block for each.
4203          */
4204
4205         total = 0;
4206         for_each_netdev(net, dev) {
4207                 for (i = 0; i < NPROTO; i++) {
4208                         if (gifconf_list[i]) {
4209                                 int done;
4210                                 if (!pos)
4211                                         done = gifconf_list[i](dev, NULL, 0);
4212                                 else
4213                                         done = gifconf_list[i](dev, pos + total,
4214                                                                len - total);
4215                                 if (done < 0)
4216                                         return -EFAULT;
4217                                 total += done;
4218                         }
4219                 }
4220         }
4221
4222         /*
4223          *      All done.  Write the updated control block back to the caller.
4224          */
4225         ifc.ifc_len = total;
4226
4227         /*
4228          *      Both BSD and Solaris return 0 here, so we do too.
4229          */
4230         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4231 }
4232
4233 #ifdef CONFIG_PROC_FS
4234
/*
 * A position value is split in two: the low BUCKET_SPACE bits carry an
 * offset and the bits above them carry a bucket number.  get_bucket() and
 * get_offset() extract the two halves; set_bucket_offset() combines them.
 */
4235 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4236
4237 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4238 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4239 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4240
4241 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4242 {
4243         struct net *net = seq_file_net(seq);
4244         struct net_device *dev;
4245         struct hlist_node *p;
4246         struct hlist_head *h;
4247         unsigned int count = 0, offset = get_offset(*pos);
4248
4249         h = &net->dev_name_head[get_bucket(*pos)];
4250         hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4251                 if (++count == offset)
4252                         return dev;
4253         }
4254
4255         return NULL;
4256 }
4257
4258 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4259 {
4260         struct net_device *dev;
4261         unsigned int bucket;
4262
4263         do {
4264                 dev = dev_from_same_bucket(seq, pos);
4265                 if (dev)
4266                         return dev;
4267
4268                 bucket = get_bucket(*pos) + 1;
4269                 *pos = set_bucket_offset(bucket, 1);
4270         } while (bucket < NETDEV_HASHENTRIES);
4271
4272         return NULL;
4273 }
4274
4275 /*
4276  *      This is invoked by the /proc filesystem handler to display a device
4277  *      in detail.
4278  */
4279 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4280         __acquires(RCU)
4281 {
4282         rcu_read_lock();
4283         if (!*pos)
4284                 return SEQ_START_TOKEN;
4285
4286         if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4287                 return NULL;
4288
4289         return dev_from_bucket(seq, pos);
4290 }
4291
4292 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4293 {
4294         ++*pos;
4295         return dev_from_bucket(seq, pos);
4296 }
4297
4298 void dev_seq_stop(struct seq_file *seq, void *v)
4299         __releases(RCU)
4300 {
4301         rcu_read_unlock();
4302 }
4303
4304 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4305 {
4306         struct rtnl_link_stats64 temp;
4307         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4308
4309         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4310                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4311                    dev->name, stats->rx_bytes, stats->rx_packets,
4312                    stats->rx_errors,
4313                    stats->rx_dropped + stats->rx_missed_errors,
4314                    stats->rx_fifo_errors,
4315                    stats->rx_length_errors + stats->rx_over_errors +
4316                     stats->rx_crc_errors + stats->rx_frame_errors,
4317                    stats->rx_compressed, stats->multicast,
4318                    stats->tx_bytes, stats->tx_packets,
4319                    stats->tx_errors, stats->tx_dropped,
4320                    stats->tx_fifo_errors, stats->collisions,
4321                    stats->tx_carrier_errors +
4322                     stats->tx_aborted_errors +
4323                     stats->tx_window_errors +
4324                     stats->tx_heartbeat_errors,
4325                    stats->tx_compressed);
4326 }
4327
4328 /*
4329  *      Called from the PROCfs module. This now uses the new arbitrary sized
4330  *      /proc/net interface to create /proc/net/dev
4331  */
4332 static int dev_seq_show(struct seq_file *seq, void *v)
4333 {
4334         if (v == SEQ_START_TOKEN)
4335                 seq_puts(seq, "Inter-|   Receive                            "
4336                               "                    |  Transmit\n"
4337                               " face |bytes    packets errs drop fifo frame "
4338                               "compressed multicast|bytes    packets errs "
4339                               "drop fifo colls carrier compressed\n");
4340         else
4341                 dev_seq_printf_stats(seq, v);
4342         return 0;
4343 }
4344
4345 static struct softnet_data *softnet_get_online(loff_t *pos)
4346 {
4347         struct softnet_data *sd = NULL;
4348
4349         while (*pos < nr_cpu_ids)
4350                 if (cpu_online(*pos)) {
4351                         sd = &per_cpu(softnet_data, *pos);
4352                         break;
4353                 } else
4354                         ++*pos;
4355         return sd;
4356 }
4357
4358 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4359 {
4360         return softnet_get_online(pos);
4361 }
4362
4363 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4364 {
4365         ++*pos;
4366         return softnet_get_online(pos);
4367 }
4368
/* seq_file ->stop: nothing to undo, start/next take no locks. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4372
4373 static int softnet_seq_show(struct seq_file *seq, void *v)
4374 {
4375         struct softnet_data *sd = v;
4376
4377         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4378                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4379                    0, 0, 0, 0, /* was fastroute */
4380                    sd->cpu_collision, sd->received_rps);
4381         return 0;
4382 }
4383
4384 static const struct seq_operations dev_seq_ops = {
4385         .start = dev_seq_start,
4386         .next  = dev_seq_next,
4387         .stop  = dev_seq_stop,
4388         .show  = dev_seq_show,
4389 };
4390
4391 static int dev_seq_open(struct inode *inode, struct file *file)
4392 {
4393         return seq_open_net(inode, file, &dev_seq_ops,
4394                             sizeof(struct seq_net_private));
4395 }
4396
4397 static const struct file_operations dev_seq_fops = {
4398         .owner   = THIS_MODULE,
4399         .open    = dev_seq_open,
4400         .read    = seq_read,
4401         .llseek  = seq_lseek,
4402         .release = seq_release_net,
4403 };
4404
4405 static const struct seq_operations softnet_seq_ops = {
4406         .start = softnet_seq_start,
4407         .next  = softnet_seq_next,
4408         .stop  = softnet_seq_stop,
4409         .show  = softnet_seq_show,
4410 };
4411
4412 static int softnet_seq_open(struct inode *inode, struct file *file)
4413 {
4414         return seq_open(file, &softnet_seq_ops);
4415 }
4416
4417 static const struct file_operations softnet_seq_fops = {
4418         .owner   = THIS_MODULE,
4419         .open    = softnet_seq_open,
4420         .read    = seq_read,
4421         .llseek  = seq_lseek,
4422         .release = seq_release,
4423 };
4424
4425 static void *ptype_get_idx(loff_t pos)
4426 {
4427         struct packet_type *pt = NULL;
4428         loff_t i = 0;
4429         int t;
4430
4431         list_for_each_entry_rcu(pt, &ptype_all, list) {
4432                 if (i == pos)
4433                         return pt;
4434                 ++i;
4435         }
4436
4437         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4438                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4439                         if (i == pos)
4440                                 return pt;
4441                         ++i;
4442                 }
4443         }
4444         return NULL;
4445 }
4446
4447 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4448         __acquires(RCU)
4449 {
4450         rcu_read_lock();
4451         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4452 }
4453
4454 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4455 {
4456         struct packet_type *pt;
4457         struct list_head *nxt;
4458         int hash;
4459
4460         ++*pos;
4461         if (v == SEQ_START_TOKEN)
4462                 return ptype_get_idx(0);
4463
4464         pt = v;
4465         nxt = pt->list.next;
4466         if (pt->type == htons(ETH_P_ALL)) {
4467                 if (nxt != &ptype_all)
4468                         goto found;
4469                 hash = 0;
4470                 nxt = ptype_base[0].next;
4471         } else
4472                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4473
4474         while (nxt == &ptype_base[hash]) {
4475                 if (++hash >= PTYPE_HASH_SIZE)
4476                         return NULL;
4477                 nxt = ptype_base[hash].next;
4478         }
4479 found:
4480         return list_entry(nxt, struct packet_type, list);
4481 }
4482
4483 static void ptype_seq_stop(struct seq_file *seq, void *v)
4484         __releases(RCU)
4485 {
4486         rcu_read_unlock();
4487 }
4488
4489 static int ptype_seq_show(struct seq_file *seq, void *v)
4490 {
4491         struct packet_type *pt = v;
4492
4493         if (v == SEQ_START_TOKEN)
4494                 seq_puts(seq, "Type Device      Function\n");
4495         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4496                 if (pt->type == htons(ETH_P_ALL))
4497                         seq_puts(seq, "ALL ");
4498                 else
4499                         seq_printf(seq, "%04x", ntohs(pt->type));
4500
4501                 seq_printf(seq, " %-8s %pF\n",
4502                            pt->dev ? pt->dev->name : "", pt->func);
4503         }
4504
4505         return 0;
4506 }
4507
4508 static const struct seq_operations ptype_seq_ops = {
4509         .start = ptype_seq_start,
4510         .next  = ptype_seq_next,
4511         .stop  = ptype_seq_stop,
4512         .show  = ptype_seq_show,
4513 };
4514
4515 static int ptype_seq_open(struct inode *inode, struct file *file)
4516 {
4517         return seq_open_net(inode, file, &ptype_seq_ops,
4518                         sizeof(struct seq_net_private));
4519 }
4520
4521 static const struct file_operations ptype_seq_fops = {
4522         .owner   = THIS_MODULE,
4523         .open    = ptype_seq_open,
4524         .read    = seq_read,
4525         .llseek  = seq_lseek,
4526         .release = seq_release_net,
4527 };
4528
4529
4530 static int __net_init dev_proc_net_init(struct net *net)
4531 {
4532         int rc = -ENOMEM;
4533
4534         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4535                 goto out;
4536         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4537                 goto out_dev;
4538         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4539                 goto out_softnet;
4540
4541         if (wext_proc_init(net))
4542                 goto out_ptype;
4543         rc = 0;
4544 out:
4545         return rc;
4546 out_ptype:
4547         proc_net_remove(net, "ptype");
4548 out_softnet:
4549         proc_net_remove(net, "softnet_stat");
4550 out_dev:
4551         proc_net_remove(net, "dev");
4552         goto out;
4553 }
4554
4555 static void __net_exit dev_proc_net_exit(struct net *net)
4556 {
4557         wext_proc_exit(net);
4558
4559         proc_net_remove(net, "ptype");
4560         proc_net_remove(net, "softnet_stat");
4561         proc_net_remove(net, "dev");
4562 }
4563
4564 static struct pernet_operations __net_initdata dev_proc_ops = {
4565         .init = dev_proc_net_init,
4566         .exit = dev_proc_net_exit,
4567 };
4568
4569 static int __init dev_proc_init(void)
4570 {
4571         return register_pernet_subsys(&dev_proc_ops);
4572 }
4573 #else
4574 #define dev_proc_init() 0
4575 #endif  /* CONFIG_PROC_FS */
4576
4577
4578 /**
4579  *      netdev_set_master       -       set up master pointer
4580  *      @slave: slave device
4581  *      @master: new master device
4582  *
4583  *      Changes the master device of the slave. Pass %NULL to break the
4584  *      bonding. The caller must hold the RTNL semaphore. On a failure
4585  *      a negative errno code is returned. On success the reference counts
4586  *      are adjusted and the function returns zero.
4587  */
4588 int netdev_set_master(struct net_device *slave, struct net_device *master)
4589 {
4590         struct net_device *old = slave->master;
4591
4592         ASSERT_RTNL();
4593
4594         if (master) {
4595                 if (old)
4596                         return -EBUSY;
4597                 dev_hold(master);
4598         }
4599
4600         slave->master = master;
4601
4602         if (old)
4603                 dev_put(old);
4604         return 0;
4605 }
4606 EXPORT_SYMBOL(netdev_set_master);
4607
4608 /**
4609  *      netdev_set_bond_master  -       set up bonding master/slave pair
4610  *      @slave: slave device
4611  *      @master: new master device
4612  *
4613  *      Changes the master device of the slave. Pass %NULL to break the
4614  *      bonding. The caller must hold the RTNL semaphore. On a failure
4615  *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4616  *      to the routing socket and the function returns zero.
4617  */
4618 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4619 {
4620         int err;
4621
4622         ASSERT_RTNL();
4623
4624         err = netdev_set_master(slave, master);
4625         if (err)
4626                 return err;
4627         if (master)
4628                 slave->flags |= IFF_SLAVE;
4629         else
4630                 slave->flags &= ~IFF_SLAVE;
4631
4632         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4633         return 0;
4634 }
4635 EXPORT_SYMBOL(netdev_set_bond_master);
4636
4637 static void dev_change_rx_flags(struct net_device *dev, int flags)
4638 {
4639         const struct net_device_ops *ops = dev->netdev_ops;
4640
4641         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4642                 ops->ndo_change_rx_flags(dev, flags);
4643 }
4644
4645 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4646 {
4647         unsigned int old_flags = dev->flags;
4648         kuid_t uid;
4649         kgid_t gid;
4650
4651         ASSERT_RTNL();
4652
4653         dev->flags |= IFF_PROMISC;
4654         dev->promiscuity += inc;
4655         if (dev->promiscuity == 0) {
4656                 /*
4657                  * Avoid overflow.
4658                  * If inc causes overflow, untouch promisc and return error.
4659                  */
4660                 if (inc < 0)
4661                         dev->flags &= ~IFF_PROMISC;
4662                 else {
4663                         dev->promiscuity -= inc;
4664                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4665                                 dev->name);
4666                         return -EOVERFLOW;
4667                 }
4668         }
4669         if (dev->flags != old_flags) {
4670                 pr_info("device %s %s promiscuous mode\n",
4671                         dev->name,
4672                         dev->flags & IFF_PROMISC ? "entered" : "left");
4673                 if (audit_enabled) {
4674                         current_uid_gid(&uid, &gid);
4675                         audit_log(current->audit_context, GFP_ATOMIC,
4676                                 AUDIT_ANOM_PROMISCUOUS,
4677                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4678                                 dev->name, (dev->flags & IFF_PROMISC),
4679                                 (old_flags & IFF_PROMISC),
4680                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4681                                 from_kuid(&init_user_ns, uid),
4682                                 from_kgid(&init_user_ns, gid),
4683                                 audit_get_sessionid(current));
4684                 }
4685
4686                 dev_change_rx_flags(dev, IFF_PROMISC);
4687         }
4688         return 0;
4689 }
4690
4691 /**
4692  *      dev_set_promiscuity     - update promiscuity count on a device
4693  *      @dev: device
4694  *      @inc: modifier
4695  *
4696  *      Add or remove promiscuity from a device. While the count in the device
4697  *      remains above zero the interface remains promiscuous. Once it hits zero
4698  *      the device reverts back to normal filtering operation. A negative inc
4699  *      value is used to drop promiscuity on the device.
4700  *      Return 0 if successful or a negative errno code on error.
4701  */
4702 int dev_set_promiscuity(struct net_device *dev, int inc)
4703 {
4704         unsigned int old_flags = dev->flags;
4705         int err;
4706
4707         err = __dev_set_promiscuity(dev, inc);
4708         if (err < 0)
4709                 return err;
4710         if (dev->flags != old_flags)
4711                 dev_set_rx_mode(dev);
4712         return err;
4713 }
4714 EXPORT_SYMBOL(dev_set_promiscuity);
4715
4716 /**
4717  *      dev_set_allmulti        - update allmulti count on a device
4718  *      @dev: device
4719  *      @inc: modifier
4720  *
4721  *      Add or remove reception of all multicast frames to a device. While the
4722  *      count in the device remains above zero the interface remains listening
4723  *      to all interfaces. Once it hits zero the device reverts back to normal
4724  *      filtering operation. A negative @inc value is used to drop the counter
4725  *      when releasing a resource needing all multicasts.
4726  *      Return 0 if successful or a negative errno code on error.
4727  */
4728
4729 int dev_set_allmulti(struct net_device *dev, int inc)
4730 {
4731         unsigned int old_flags = dev->flags;
4732
4733         ASSERT_RTNL();
4734
4735         dev->flags |= IFF_ALLMULTI;
4736         dev->allmulti += inc;
4737         if (dev->allmulti == 0) {
4738                 /*
4739                  * Avoid overflow.
4740                  * If inc causes overflow, untouch allmulti and return error.
4741                  */
4742                 if (inc < 0)
4743                         dev->flags &= ~IFF_ALLMULTI;
4744                 else {
4745                         dev->allmulti -= inc;
4746                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4747                                 dev->name);
4748                         return -EOVERFLOW;
4749                 }
4750         }
4751         if (dev->flags ^ old_flags) {
4752                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4753                 dev_set_rx_mode(dev);
4754         }
4755         return 0;
4756 }
4757 EXPORT_SYMBOL(dev_set_allmulti);
4758
4759 /*
4760  *      Upload unicast and multicast address lists to device and
4761  *      configure RX filtering. When the device doesn't support unicast
4762  *      filtering it is put in promiscuous mode while unicast addresses
4763  *      are present.
4764  */
4765 void __dev_set_rx_mode(struct net_device *dev)
4766 {
4767         const struct net_device_ops *ops = dev->netdev_ops;
4768
4769         /* dev_open will call this function so the list will stay sane. */
4770         if (!(dev->flags&IFF_UP))
4771                 return;
4772
4773         if (!netif_device_present(dev))
4774                 return;
4775
4776         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4777                 /* Unicast addresses changes may only happen under the rtnl,
4778                  * therefore calling __dev_set_promiscuity here is safe.
4779                  */
4780                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4781                         __dev_set_promiscuity(dev, 1);
4782                         dev->uc_promisc = true;
4783                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4784                         __dev_set_promiscuity(dev, -1);
4785                         dev->uc_promisc = false;
4786                 }
4787         }
4788
4789         if (ops->ndo_set_rx_mode)
4790                 ops->ndo_set_rx_mode(dev);
4791 }
4792
/*
 *      Locked wrapper: run __dev_set_rx_mode() with the device's address
 *      lock held (BH-disabling variant).
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);
        __dev_set_rx_mode(dev);
        netif_addr_unlock_bh(dev);
}
4799
4800 /**
4801  *      dev_get_flags - get flags reported to userspace
4802  *      @dev: device
4803  *
4804  *      Get the combination of flag bits exported through APIs to userspace.
4805  */
4806 unsigned int dev_get_flags(const struct net_device *dev)
4807 {
4808         unsigned int flags;
4809
4810         flags = (dev->flags & ~(IFF_PROMISC |
4811                                 IFF_ALLMULTI |
4812                                 IFF_RUNNING |
4813                                 IFF_LOWER_UP |
4814                                 IFF_DORMANT)) |
4815                 (dev->gflags & (IFF_PROMISC |
4816                                 IFF_ALLMULTI));
4817
4818         if (netif_running(dev)) {
4819                 if (netif_oper_up(dev))
4820                         flags |= IFF_RUNNING;
4821                 if (netif_carrier_ok(dev))
4822                         flags |= IFF_LOWER_UP;
4823                 if (netif_dormant(dev))
4824                         flags |= IFF_DORMANT;
4825         }
4826
4827         return flags;
4828 }
4829 EXPORT_SYMBOL(dev_get_flags);
4830
4831 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4832 {
4833         unsigned int old_flags = dev->flags;
4834         int ret;
4835
4836         ASSERT_RTNL();
4837
4838         /*
4839          *      Set the flags on our device.
4840          */
4841
4842         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4843                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4844                                IFF_AUTOMEDIA)) |
4845                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4846                                     IFF_ALLMULTI));
4847
4848         /*
4849          *      Load in the correct multicast list now the flags have changed.
4850          */
4851
4852         if ((old_flags ^ flags) & IFF_MULTICAST)
4853                 dev_change_rx_flags(dev, IFF_MULTICAST);
4854
4855         dev_set_rx_mode(dev);
4856
4857         /*
4858          *      Have we downed the interface. We handle IFF_UP ourselves
4859          *      according to user attempts to set it, rather than blindly
4860          *      setting it.
4861          */
4862
4863         ret = 0;
4864         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4865                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4866
4867                 if (!ret)
4868                         dev_set_rx_mode(dev);
4869         }
4870
4871         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4872                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4873
4874                 dev->gflags ^= IFF_PROMISC;
4875                 dev_set_promiscuity(dev, inc);
4876         }
4877
4878         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4879            is important. Some (broken) drivers set IFF_PROMISC, when
4880            IFF_ALLMULTI is requested not asking us and not reporting.
4881          */
4882         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4883                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4884
4885                 dev->gflags ^= IFF_ALLMULTI;
4886                 dev_set_allmulti(dev, inc);
4887         }
4888
4889         return ret;
4890 }
4891
4892 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4893 {
4894         unsigned int changes = dev->flags ^ old_flags;
4895
4896         if (changes & IFF_UP) {
4897                 if (dev->flags & IFF_UP)
4898                         call_netdevice_notifiers(NETDEV_UP, dev);
4899                 else
4900                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4901         }
4902
4903         if (dev->flags & IFF_UP &&
4904             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4905                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4906 }
4907
4908 /**
4909  *      dev_change_flags - change device settings
4910  *      @dev: device
4911  *      @flags: device state flags
4912  *
4913  *      Change settings on device based state flags. The flags are
4914  *      in the userspace exported format.
4915  */
4916 int dev_change_flags(struct net_device *dev, unsigned int flags)
4917 {
4918         int ret;
4919         unsigned int changes, old_flags = dev->flags;
4920
4921         ret = __dev_change_flags(dev, flags);
4922         if (ret < 0)
4923                 return ret;
4924
4925         changes = old_flags ^ dev->flags;
4926         if (changes)
4927                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4928
4929         __dev_notify_flags(dev, old_flags);
4930         return ret;
4931 }
4932 EXPORT_SYMBOL(dev_change_flags);
4933
4934 /**
4935  *      dev_set_mtu - Change maximum transfer unit
4936  *      @dev: device
4937  *      @new_mtu: new transfer unit
4938  *
4939  *      Change the maximum transfer size of the network device.
4940  */
4941 int dev_set_mtu(struct net_device *dev, int new_mtu)
4942 {
4943         const struct net_device_ops *ops = dev->netdev_ops;
4944         int err;
4945
4946         if (new_mtu == dev->mtu)
4947                 return 0;
4948
4949         /*      MTU must be positive.    */
4950         if (new_mtu < 0)
4951                 return -EINVAL;
4952
4953         if (!netif_device_present(dev))
4954                 return -ENODEV;
4955
4956         err = 0;
4957         if (ops->ndo_change_mtu)
4958                 err = ops->ndo_change_mtu(dev, new_mtu);
4959         else
4960                 dev->mtu = new_mtu;
4961
4962         if (!err && dev->flags & IFF_UP)
4963                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4964         return err;
4965 }
4966 EXPORT_SYMBOL(dev_set_mtu);
4967
4968 /**
4969  *      dev_set_group - Change group this device belongs to
4970  *      @dev: device
4971  *      @new_group: group this device should belong to
4972  */
4973 void dev_set_group(struct net_device *dev, int new_group)
4974 {
4975         dev->group = new_group;
4976 }
4977 EXPORT_SYMBOL(dev_set_group);
4978
4979 /**
4980  *      dev_set_mac_address - Change Media Access Control Address
4981  *      @dev: device
4982  *      @sa: new address
4983  *
4984  *      Change the hardware (MAC) address of the device
4985  */
4986 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4987 {
4988         const struct net_device_ops *ops = dev->netdev_ops;
4989         int err;
4990
4991         if (!ops->ndo_set_mac_address)
4992                 return -EOPNOTSUPP;
4993         if (sa->sa_family != dev->type)
4994                 return -EINVAL;
4995         if (!netif_device_present(dev))
4996                 return -ENODEV;
4997         err = ops->ndo_set_mac_address(dev, sa);
4998         if (!err)
4999                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5000         add_device_randomness(dev->dev_addr, dev->addr_len);
5001         return err;
5002 }
5003 EXPORT_SYMBOL(dev_set_mac_address);
5004
5005 /*
5006  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
5007  */
5008 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
5009 {
5010         int err;
5011         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
5012
5013         if (!dev)
5014                 return -ENODEV;
5015
5016         switch (cmd) {
5017         case SIOCGIFFLAGS:      /* Get interface flags */
5018                 ifr->ifr_flags = (short) dev_get_flags(dev);
5019                 return 0;
5020
5021         case SIOCGIFMETRIC:     /* Get the metric on the interface
5022                                    (currently unused) */
5023                 ifr->ifr_metric = 0;
5024                 return 0;
5025
5026         case SIOCGIFMTU:        /* Get the MTU of a device */
5027                 ifr->ifr_mtu = dev->mtu;
5028                 return 0;
5029
5030         case SIOCGIFHWADDR:
5031                 if (!dev->addr_len)
5032                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
5033                 else
5034                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
5035                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5036                 ifr->ifr_hwaddr.sa_family = dev->type;
5037                 return 0;
5038
5039         case SIOCGIFSLAVE:
5040                 err = -EINVAL;
5041                 break;
5042
5043         case SIOCGIFMAP:
5044                 ifr->ifr_map.mem_start = dev->mem_start;
5045                 ifr->ifr_map.mem_end   = dev->mem_end;
5046                 ifr->ifr_map.base_addr = dev->base_addr;
5047                 ifr->ifr_map.irq       = dev->irq;
5048                 ifr->ifr_map.dma       = dev->dma;
5049                 ifr->ifr_map.port      = dev->if_port;
5050                 return 0;
5051
5052         case SIOCGIFINDEX:
5053                 ifr->ifr_ifindex = dev->ifindex;
5054                 return 0;
5055
5056         case SIOCGIFTXQLEN:
5057                 ifr->ifr_qlen = dev->tx_queue_len;
5058                 return 0;
5059
5060         default:
5061                 /* dev_ioctl() should ensure this case
5062                  * is never reached
5063                  */
5064                 WARN_ON(1);
5065                 err = -ENOTTY;
5066                 break;
5067
5068         }
5069         return err;
5070 }
5071
5072 /*
5073  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
5074  */
5075 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
5076 {
5077         int err;
5078         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5079         const struct net_device_ops *ops;
5080
5081         if (!dev)
5082                 return -ENODEV;
5083
5084         ops = dev->netdev_ops;
5085
5086         switch (cmd) {
5087         case SIOCSIFFLAGS:      /* Set interface flags */
5088                 return dev_change_flags(dev, ifr->ifr_flags);
5089
5090         case SIOCSIFMETRIC:     /* Set the metric on the interface
5091                                    (currently unused) */
5092                 return -EOPNOTSUPP;
5093
5094         case SIOCSIFMTU:        /* Set the MTU of a device */
5095                 return dev_set_mtu(dev, ifr->ifr_mtu);
5096
5097         case SIOCSIFHWADDR:
5098                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
5099
5100         case SIOCSIFHWBROADCAST:
5101                 if (ifr->ifr_hwaddr.sa_family != dev->type)
5102                         return -EINVAL;
5103                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
5104                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5105                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5106                 return 0;
5107
5108         case SIOCSIFMAP:
5109                 if (ops->ndo_set_config) {
5110                         if (!netif_device_present(dev))
5111                                 return -ENODEV;
5112                         return ops->ndo_set_config(dev, &ifr->ifr_map);
5113                 }
5114                 return -EOPNOTSUPP;
5115
5116         case SIOCADDMULTI:
5117                 if (!ops->ndo_set_rx_mode ||
5118                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5119                         return -EINVAL;
5120                 if (!netif_device_present(dev))
5121                         return -ENODEV;
5122                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5123
5124         case SIOCDELMULTI:
5125                 if (!ops->ndo_set_rx_mode ||
5126                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5127                         return -EINVAL;
5128                 if (!netif_device_present(dev))
5129                         return -ENODEV;
5130                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5131
5132         case SIOCSIFTXQLEN:
5133                 if (ifr->ifr_qlen < 0)
5134                         return -EINVAL;
5135                 dev->tx_queue_len = ifr->ifr_qlen;
5136                 return 0;
5137
5138         case SIOCSIFNAME:
5139                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5140                 return dev_change_name(dev, ifr->ifr_newname);
5141
5142         case SIOCSHWTSTAMP:
5143                 err = net_hwtstamp_validate(ifr);
5144                 if (err)
5145                         return err;
5146                 /* fall through */
5147
5148         /*
5149          *      Unknown or private ioctl
5150          */
5151         default:
5152                 if ((cmd >= SIOCDEVPRIVATE &&
5153                     cmd <= SIOCDEVPRIVATE + 15) ||
5154                     cmd == SIOCBONDENSLAVE ||
5155                     cmd == SIOCBONDRELEASE ||
5156                     cmd == SIOCBONDSETHWADDR ||
5157                     cmd == SIOCBONDSLAVEINFOQUERY ||
5158                     cmd == SIOCBONDINFOQUERY ||
5159                     cmd == SIOCBONDCHANGEACTIVE ||
5160                     cmd == SIOCGMIIPHY ||
5161                     cmd == SIOCGMIIREG ||
5162                     cmd == SIOCSMIIREG ||
5163                     cmd == SIOCBRADDIF ||
5164                     cmd == SIOCBRDELIF ||
5165                     cmd == SIOCSHWTSTAMP ||
5166                     cmd == SIOCWANDEV) {
5167                         err = -EOPNOTSUPP;
5168                         if (ops->ndo_do_ioctl) {
5169                                 if (netif_device_present(dev))
5170                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
5171                                 else
5172                                         err = -ENODEV;
5173                         }
5174                 } else
5175                         err = -EINVAL;
5176
5177         }
5178         return err;
5179 }
5180
5181 /*
5182  *      This function handles all "interface"-type I/O control requests. The actual
5183  *      'doing' part of this is dev_ifsioc above.
5184  */
5185
5186 /**
5187  *      dev_ioctl       -       network device ioctl
5188  *      @net: the applicable net namespace
5189  *      @cmd: command to issue
5190  *      @arg: pointer to a struct ifreq in user space
5191  *
5192  *      Issue ioctl functions to devices. This is normally called by the
5193  *      user space syscall interfaces but can sometimes be useful for
5194  *      other purposes. The return value is the return from the syscall if
5195  *      positive or a negative errno code on error.
      *
      *      SIOCGIFCONF and SIOCGIFNAME are handed to dev_ifconf() and
      *      dev_ifname() before @arg is copied in. For every other command
      *      the struct ifreq is copied from @arg (-EFAULT on failure), its
      *      name is NUL terminated and cut at the first ':' and the command
      *      is then dispatched by group. The privileged groups fail with
      *      -EPERM without CAP_NET_ADMIN; unsupported commands give -ENOTTY.
5196  */
5197
5198 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5199 {
5200         struct ifreq ifr;
5201         int ret;
5202         char *colon;
5203
5204         /* One special case: SIOCGIFCONF takes ifconf argument
5205            and requires shared lock, because it sleeps writing
5206            to user space.
5207          */
5208
5209         if (cmd == SIOCGIFCONF) {
5210                 rtnl_lock();
5211                 ret = dev_ifconf(net, (char __user *) arg);
5212                 rtnl_unlock();
5213                 return ret;
5214         }
             /* SIOCGIFNAME is served by dev_ifname() without taking rtnl here. */
5215         if (cmd == SIOCGIFNAME)
5216                 return dev_ifname(net, (struct ifreq __user *)arg);
5217
5218         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5219                 return -EFAULT;
5220
             /* Force NUL termination of the name supplied by user space. */
5221         ifr.ifr_name[IFNAMSIZ-1] = 0;
5222
             /*
              * Cut the name at the first ':' for the lookups below. The
              * position is remembered so the ':' can be put back before the
              * ifreq is copied out again. Presumably this strips an alias
              * suffix such as "eth0:1" - confirm against the callers.
              */
5223         colon = strchr(ifr.ifr_name, ':');
5224         if (colon)
5225                 *colon = 0;
5226
5227         /*
5228          *      See which interface the caller is talking about.
5229          */
5230
5231         switch (cmd) {
5232         /*
5233          *      These ioctl calls:
5234          *      - can be done by all.
5235          *      - atomic and do not require locking.
5236          *      - return a value
5237          */
5238         case SIOCGIFFLAGS:
5239         case SIOCGIFMETRIC:
5240         case SIOCGIFMTU:
5241         case SIOCGIFHWADDR:
5242         case SIOCGIFSLAVE:
5243         case SIOCGIFMAP:
5244         case SIOCGIFINDEX:
5245         case SIOCGIFTXQLEN:
                     /* dev_load() is called before any lock is taken here. */
5246                 dev_load(net, ifr.ifr_name);
                     /* Read-only queries run under rcu_read_lock() only. */
5247                 rcu_read_lock();
5248                 ret = dev_ifsioc_locked(net, &ifr, cmd);
5249                 rcu_read_unlock();
5250                 if (!ret) {
                             /* Success: restore the ':' and copy the result out. */
5251                         if (colon)
5252                                 *colon = ':';
5253                         if (copy_to_user(arg, &ifr,
5254                                          sizeof(struct ifreq)))
5255                                 ret = -EFAULT;
5256                 }
5257                 return ret;
5258
             /* ethtool requests go to dev_ethtool() under rtnl; no capability check here. */
5259         case SIOCETHTOOL:
5260                 dev_load(net, ifr.ifr_name);
5261                 rtnl_lock();
5262                 ret = dev_ethtool(net, &ifr);
5263                 rtnl_unlock();
5264                 if (!ret) {
5265                         if (colon)
5266                                 *colon = ':';
5267                         if (copy_to_user(arg, &ifr,
5268                                          sizeof(struct ifreq)))
5269                                 ret = -EFAULT;
5270                 }
5271                 return ret;
5272
5273         /*
5274          *      These ioctl calls:
5275          *      - require superuser power.
5276          *      - require strict serialization.
5277          *      - return a value
5278          */
5279         case SIOCGMIIPHY:
5280         case SIOCGMIIREG:
5281         case SIOCSIFNAME:
5282                 if (!capable(CAP_NET_ADMIN))
5283                         return -EPERM;
5284                 dev_load(net, ifr.ifr_name);
5285                 rtnl_lock();
5286                 ret = dev_ifsioc(net, &ifr, cmd);
5287                 rtnl_unlock();
5288                 if (!ret) {
5289                         if (colon)
5290                                 *colon = ':';
5291                         if (copy_to_user(arg, &ifr,
5292                                          sizeof(struct ifreq)))
5293                                 ret = -EFAULT;
5294                 }
5295                 return ret;
5296
5297         /*
5298          *      These ioctl calls:
5299          *      - require superuser power.
5300          *      - require strict serialization.
5301          *      - do not return a value
5302          */
5303         case SIOCSIFFLAGS:
5304         case SIOCSIFMETRIC:
5305         case SIOCSIFMTU:
5306         case SIOCSIFMAP:
5307         case SIOCSIFHWADDR:
5308         case SIOCSIFSLAVE:
5309         case SIOCADDMULTI:
5310         case SIOCDELMULTI:
5311         case SIOCSIFHWBROADCAST:
5312         case SIOCSIFTXQLEN:
5313         case SIOCSMIIREG:
5314         case SIOCBONDENSLAVE:
5315         case SIOCBONDRELEASE:
5316         case SIOCBONDSETHWADDR:
5317         case SIOCBONDCHANGEACTIVE:
5318         case SIOCBRADDIF:
5319         case SIOCBRDELIF:
5320         case SIOCSHWTSTAMP:
5321                 if (!capable(CAP_NET_ADMIN))
5322                         return -EPERM;
5323                 /* fall through */
             /* The two bonding queries below are reached without the capability check. */
5324         case SIOCBONDSLAVEINFOQUERY:
5325         case SIOCBONDINFOQUERY:
5326                 dev_load(net, ifr.ifr_name);
5327                 rtnl_lock();
5328                 ret = dev_ifsioc(net, &ifr, cmd);
5329                 rtnl_unlock();
                     /* The ifreq is not copied back to @arg on this path. */
5330                 return ret;
5331
5332         case SIOCGIFMEM:
5333                 /* Get the per device memory space. We can add this but
5334                  * currently do not support it */
5335         case SIOCSIFMEM:
5336                 /* Set the per device memory buffer space.
5337                  * Not applicable in our case */
5338         case SIOCSIFLINK:
5339                 return -ENOTTY;
5340
5341         /*
5342          *      Unknown or private ioctl.
5343          */
5344         default:
5345                 if (cmd == SIOCWANDEV ||
5346                     (cmd >= SIOCDEVPRIVATE &&
5347                      cmd <= SIOCDEVPRIVATE + 15)) {
5348                         dev_load(net, ifr.ifr_name);
5349                         rtnl_lock();
5350                         ret = dev_ifsioc(net, &ifr, cmd);
5351                         rtnl_unlock();
                             /*
                              * NOTE(review): unlike the cases above, a ':' cut
                              * from the name is not restored before copy-back.
                              */
5352                         if (!ret && copy_to_user(arg, &ifr,
5353                                                  sizeof(struct ifreq)))
5354                                 ret = -EFAULT;
5355                         return ret;
5356                 }
5357                 /* Take care of Wireless Extensions */
5358                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5359                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5360                 return -ENOTTY;
5361         }
5362 }
5363
5364
5365 /**
5366  *      dev_new_index   -       pick an unused interface index
5367  *      @net: the applicable net namespace
5368  *
5369  *      Probes upwards from @net->ifindex, restarting at 1 once the counter
5370  *      is no longer positive; the first free index is stored in @net and
5371  *      returned. Hold the rtnl semaphore or dev_base_lock to keep it unique.
5372  */
5373 static int dev_new_index(struct net *net)
5374 {
5375         int candidate = net->ifindex;
5376
5377         do {
5378                 if (++candidate <= 0)
5379                         candidate = 1;
5380         } while (__dev_get_by_index(net, candidate));
5381         return net->ifindex = candidate;
5382 }
5383
5384 /* Delayed registration/unregistration */
5385 static LIST_HEAD(net_todo_list);
5386
     /* Queue @dev at the tail of net_todo_list. */
5387 static void net_set_todo(struct net_device *dev)
5388 {
             struct list_head *entry = &dev->todo_list;

5389         list_add_tail(entry, &net_todo_list);
5390 }
5391
     /*
      * Unregister every device linked on @head through its unreg_list member.
      *
      * Must run with the RTNL held and outside of the boot phase (both are
      * asserted). Entries that were never registered are warned about and
      * dropped from @head; every remaining entry must be NETREG_REGISTERED.
      * The teardown then runs in batches, with synchronize_net() between them:
      * close and unlist, notify and tear down, and finally one dev_put() per
      * device.
      */
5392 static void rollback_registered_many(struct list_head *head)
5393 {
5394         struct net_device *dev, *tmp;
5395
5396         BUG_ON(dev_boot_phase);
5397         ASSERT_RTNL();
5398
             /* The _safe iterator is needed: entries may be deleted below. */
5399         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5400                 /* Some devices call without registering
5401                  * for initialization unwind. Remove those
5402                  * devices and proceed with the remaining.
5403                  */
5404                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5405                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5406                                  dev->name, dev);
5407
5408                         WARN_ON(1);
5409                         list_del(&dev->unreg_list);
5410                         continue;
5411                 }
5412                 dev->dismantle = true;
                     /* Any state other than REGISTERED is fatal at this point. */
5413                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5414         }
5415
5416         /* If device is running, close it first. */
5417         dev_close_many(head);
5418
5419         list_for_each_entry(dev, head, unreg_list) {
5420                 /* And unlink it from device chain. */
5421                 unlist_netdevice(dev);
5422
5423                 dev->reg_state = NETREG_UNREGISTERING;
5424         }
5425
             /* First barrier: taken after all devices were unlisted. */
5426         synchronize_net();
5427
5428         list_for_each_entry(dev, head, unreg_list) {
5429                 /* Shutdown queueing discipline. */
5430                 dev_shutdown(dev);
5431
5432
5433                 /* Notify protocols, that we are about to destroy
5434                    this device. They should clean all the things.
5435                 */
5436                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5437
                     /*
                      * RTM_DELLINK is sent here only for devices without
                      * rtnl_link_ops or whose link state is INITIALIZED.
                      */
5438                 if (!dev->rtnl_link_ops ||
5439                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5440                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5441
5442                 /*
5443                  *      Flush the unicast and multicast chains
5444                  */
5445                 dev_uc_flush(dev);
5446                 dev_mc_flush(dev);
5447
                     /* Optional driver hook, the counterpart of ndo_init(). */
5448                 if (dev->netdev_ops->ndo_uninit)
5449                         dev->netdev_ops->ndo_uninit(dev);
5450
5451                 /* Notifier chain MUST detach us from master device. */
5452                 WARN_ON(dev->master);
5453
5454                 /* Remove entries from kobject tree */
5455                 netdev_unregister_kobject(dev);
5456         }
5457
             /* Second barrier: taken before the references are dropped. */
5458         synchronize_net();
5459
             /*
              * One reference is dropped per device - presumably the one taken
              * by dev_hold() in register_netdevice(); confirm.
              */
5460         list_for_each_entry(dev, head, unreg_list)
5461                 dev_put(dev);
5462 }
5463
     /*
      * Single-device front end for rollback_registered_many(): @dev is put
      * on a temporary on-stack list, the batch routine is run on that list
      * and the list head is unlinked again afterwards.
      */
5464 static void rollback_registered(struct net_device *dev)
5465 {
5466         LIST_HEAD(batch);
5467
5468         list_add(&dev->unreg_list, &batch);
5469         rollback_registered_many(&batch);
5470         list_del(&batch);
5471 }
5472
     /*
      * Remove feature bits from @features whose prerequisites are not part of
      * @features, logging each removal against @dev, and return the result.
      * The checks run in a fixed order because later ones depend on bits
      * that earlier ones may have cleared (SG in particular).
      */
5473 static netdev_features_t netdev_fix_features(struct net_device *dev,
5474         netdev_features_t features)
5475 {
5476         const netdev_features_t l3_csum = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
5477
5478         /* HW_CSUM must not be combined with the per-protocol csum flags. */
5479         if ((features & l3_csum) && (features & NETIF_F_HW_CSUM)) {
5480                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5481                 features &= ~l3_csum;
5482         }
5483
5484         /* Scatter/gather is only kept together with a checksum feature. */
5485         if (!(features & NETIF_F_ALL_CSUM) && (features & NETIF_F_SG)) {
5486                 netdev_dbg(dev,
5487                         "Dropping NETIF_F_SG since no checksum feature.\n");
5488                 features &= ~NETIF_F_SG;
5489         }
5490
5491         /* Without SG none of the TSO flags can stay. */
5492         if (!(features & NETIF_F_SG) && (features & NETIF_F_ALL_TSO)) {
5493                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5494                 features &= ~NETIF_F_ALL_TSO;
5495         }
5496
5497         /* TSO_ECN as the only TSO flag left is dropped as well. */
5498         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5499                 features &= ~NETIF_F_TSO_ECN;
5500
5501         /* Software GSO likewise depends on SG. */
5502         if (!(features & NETIF_F_SG) && (features & NETIF_F_GSO)) {
5503                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5504                 features &= ~NETIF_F_GSO;
5505         }
5506
5507         /* UFO needs SG and checksumming */
5508         if (!(features & NETIF_F_UFO))
5509                 return features;
5510
5511         /* maybe split UFO into V4 and V6? */
5512         if (!(features & NETIF_F_GEN_CSUM) && (features & l3_csum) != l3_csum) {
5513                 netdev_dbg(dev,
5514                         "Dropping NETIF_F_UFO since no checksum offload features.\n");
5515                 features &= ~NETIF_F_UFO;
5516         }
5517
5518         /* Tested independently of the check above: both may log. */
5519         if (!(features & NETIF_F_SG)) {
5520                 netdev_dbg(dev,
5521                         "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5522                 features &= ~NETIF_F_UFO;
5523         }
5524
5525         return features;
5526 }
5527
     /*
      * Recompute the feature set of @dev and hand it to the driver.
      * The RTNL must be held (asserted).
      *
      * Returns 0 when the computed set equals dev->features, -1 when
      * ndo_set_features() failed with a negative value, and 1 otherwise.
      */
5528 int __netdev_update_features(struct net_device *dev)
5529 {
5530         netdev_features_t target;
5531         int rc = 0;
5532
5533         ASSERT_RTNL();
5534
5535         /* Start from the wanted set and let the driver adjust it first. */
5536         target = netdev_get_wanted_features(dev);
5537         if (dev->netdev_ops->ndo_fix_features)
5538                 target = dev->netdev_ops->ndo_fix_features(dev, target);
5539
5540         /* driver might be less strict about feature dependencies */
5541         target = netdev_fix_features(dev, target);
5542
5543         if (target == dev->features)
5544                 return 0;
5545
5546         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5547                 &dev->features, &target);
5548
5549         if (dev->netdev_ops->ndo_set_features)
5550                 rc = dev->netdev_ops->ndo_set_features(dev, target);
5551
5552         if (unlikely(rc < 0)) {
5553                 netdev_err(dev,
5554                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5555                         rc, &target, &dev->features);
5556                 return -1;
5557         }
5558
5559         /* dev->features is only written here when rc is exactly 0. */
5560         if (rc == 0)
5561                 dev->features = target;
5562         return 1;
5563 }
5564
5565 /**
5566  *      netdev_update_features - recalculate device features
5567  *      @dev: the device to check
5568  *
5569  *      Re-evaluates dev->features and sends the change notification only
5570  *      when __netdev_update_features() returns non-zero. Call it once driver
5571  *      or hardware state that influences the features may have changed.
5572  */
5573 void netdev_update_features(struct net_device *dev)
5574 {
5575         if (__netdev_update_features(dev) != 0)
5576                 netdev_features_change(dev);
5577 }
5578 EXPORT_SYMBOL(netdev_update_features);
5579
5580 /**
5581  *      netdev_change_features - recalculate device features
5582  *      @dev: the device to check
5583  *
5584  *      Like netdev_update_features(), except that the notification is
5585  *      sent unconditionally. Use it when dev->vlan_features may have
5586  *      changed too, so that stacked VLAN devices see the update even
5587  *      though dev->features itself stayed the same.
5588  */
5589 void netdev_change_features(struct net_device *dev)
5590 {
5591         /* The result is not checked: the notification is unconditional. */
5592         __netdev_update_features(dev);
5593         netdev_features_change(dev);
5594 }
5595 EXPORT_SYMBOL(netdev_change_features);
5596
5597 /**
5598  *      netif_stacked_transfer_operstate -      transfer operstate
5599  *      @rootdev: the root or lower level device to transfer state from
5600  *      @dev: the device to transfer operstate to
5601  *
5602  *      Mirrors the dormant flag and the carrier state of @rootdev onto
5603  *      @dev. Normally used when @dev (a leaf device) is stacked on top
5604  *      of @rootdev.
5605  */
5606 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5607                                         struct net_device *dev)
5608 {
5609         if (rootdev->operstate != IF_OPER_DORMANT)
5610                 netif_dormant_off(dev);
5611         else
5612                 netif_dormant_on(dev);
5613
5614         /* The carrier is only switched when the two devices disagree. */
5615         if (!netif_carrier_ok(rootdev)) {
5616                 if (netif_carrier_ok(dev))
5617                         netif_carrier_off(dev);
5618         } else if (!netif_carrier_ok(dev)) {
5619                 netif_carrier_on(dev);
5620         }
5621 }
5622 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5623
5624 #ifdef CONFIG_RPS
     /*
      * Allocate the zeroed array of dev->num_rx_queues RX queues for @dev,
      * store it in dev->_rx and point each queue back at @dev.
      * Returns 0 on success or -ENOMEM if the allocation fails.
      */
5625 static int netif_alloc_rx_queues(struct net_device *dev)
5626 {
5627         unsigned int nr_queues = dev->num_rx_queues;
5628         struct netdev_rx_queue *queues;
5629         unsigned int idx;
5630
5631         BUG_ON(nr_queues < 1);
5632         queues = kcalloc(nr_queues, sizeof(*queues), GFP_KERNEL);
5633         if (!queues) {
5634                 pr_err("netdev: Unable to allocate %u rx queues\n", nr_queues);
5635                 return -ENOMEM;
5636         }
5637         /* Store the array first, then set every back-pointer. */
5638         dev->_rx = queues;
5639         for (idx = 0; idx < nr_queues; idx++)
5640                 queues[idx].dev = dev;
5641         return 0;
5642 }
5643 #endif
5644
5645 static void netdev_init_one_queue(struct net_device *dev,
5646                                   struct netdev_queue *queue, void *_unused)
5647 {
5648         /* Initialize queue lock */
5649         spin_lock_init(&queue->_xmit_lock);
5650         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5651         queue->xmit_lock_owner = -1;
5652         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5653         queue->dev = dev;
5654 #ifdef CONFIG_BQL
5655         dql_init(&queue->dql, HZ);
5656 #endif
5657 }
5658
     /*
      * Allocate the zeroed array of dev->num_tx_queues TX queues for @dev,
      * initialise every queue and the device-wide tx_global_lock.
      * Returns 0 on success or -ENOMEM if the allocation fails.
      */
5659 static int netif_alloc_netdev_queues(struct net_device *dev)
5660 {
5661         unsigned int nr_queues = dev->num_tx_queues;
5662         struct netdev_queue *queues;
5663
5664         BUG_ON(nr_queues < 1);
5665
5666         queues = kcalloc(nr_queues, sizeof(*queues), GFP_KERNEL);
5667         if (!queues) {
5668                 pr_err("netdev: Unable to allocate %u tx queues\n", nr_queues);
5669                 return -ENOMEM;
5670         }
5671
5672         /* Set dev->_tx before the walk (presumably it iterates dev->_tx). */
5673         dev->_tx = queues;
5674         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5675         spin_lock_init(&dev->tx_global_lock);
5676         return 0;
5677 }
5678
5679 /**
5680  *      register_netdevice      - register a network device
5681  *      @dev: device to register
5682  *
5683  *      Take a completed network device structure and add it to the kernel
5684  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5685  *      chain. 0 is returned on success. A negative errno code is returned
5686  *      on a failure to set up the device, or if the name is a duplicate.
5687  *
5688  *      Callers must hold the rtnl semaphore. You may want
5689  *      register_netdev() instead of this.
      *
      *      @dev must be in state NETREG_UNINITIALIZED and the call must not
      *      happen during the boot phase; both are enforced with BUG_ON().
      *      A positive return from ndo_init() is reported as -EIO, and a
      *      preset dev->ifindex that is already in use as -EBUSY.
5690  *
5691  *      BUGS:
5692  *      The locking appears insufficient to guarantee two parallel registers
5693  *      will not get the same name.
5694  */
5695
5696 int register_netdevice(struct net_device *dev)
5697 {
5698         int ret;
5699         struct net *net = dev_net(dev);
5700
5701         BUG_ON(dev_boot_phase);
5702         ASSERT_RTNL();
5703
5704         might_sleep();
5705
5706         /* When net_device's are persistent, this will be fatal. */
5707         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5708         BUG_ON(!net);
5709
5710         spin_lock_init(&dev->addr_list_lock);
5711         netdev_set_addr_lockdep_class(dev);
5712
             /* -1 marks iflink as unset; see the default applied further down. */
5713         dev->iflink = -1;
5714
5715         ret = dev_get_valid_name(net, dev, dev->name);
5716         if (ret < 0)
5717                 goto out;
5718
5719         /* Init, if this function is available */
5720         if (dev->netdev_ops->ndo_init) {
5721                 ret = dev->netdev_ops->ndo_init(dev);
5722                 if (ret) {
                             /* Positive values are turned into -EIO. */
5723                         if (ret > 0)
5724                                 ret = -EIO;
5725                         goto out;
5726                 }
5727         }
5728
             /* -EBUSY is what err_uninit reports for a preset, taken ifindex. */
5729         ret = -EBUSY;
5730         if (!dev->ifindex)
5731                 dev->ifindex = dev_new_index(net);
5732         else if (__dev_get_by_index(net, dev->ifindex))
5733                 goto err_uninit;
5734
             /* Still unset (presumably ndo_init() may set it): use ifindex. */
5735         if (dev->iflink == -1)
5736                 dev->iflink = dev->ifindex;
5737
5738         /* Transfer changeable features to wanted_features and enable
5739          * software offloads (GSO and GRO).
5740          */
5741         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5742         dev->features |= NETIF_F_SOFT_FEATURES;
5743         dev->wanted_features = dev->features & dev->hw_features;
5744
5745         /* Turn on no cache copy if HW is doing checksum */
5746         if (!(dev->flags & IFF_LOOPBACK)) {
5747                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5748                 if (dev->features & NETIF_F_ALL_CSUM) {
5749                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5750                         dev->features |= NETIF_F_NOCACHE_COPY;
5751                 }
5752         }
5753
5754         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5755          */
5756         dev->vlan_features |= NETIF_F_HIGHDMA;
5757
             /* A NETDEV_POST_INIT notifier can abort the registration here. */
5758         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5759         ret = notifier_to_errno(ret);
5760         if (ret)
5761                 goto err_uninit;
5762
5763         ret = netdev_register_kobject(dev);
5764         if (ret)
5765                 goto err_uninit;
             /* From here on failures are undone via rollback_registered(). */
5766         dev->reg_state = NETREG_REGISTERED;
5767
             /* The return value is not checked here. */
5768         __netdev_update_features(dev);
5769
5770         /*
5771          *      Default initial state at registry is that the
5772          *      device is present.
5773          */
5774
5775         set_bit(__LINK_STATE_PRESENT, &dev->state);
5776
5777         linkwatch_init_dev(dev);
5778
5779         dev_init_scheduler(dev);
             /* rollback_registered_many() ends with one dev_put() per device. */
5780         dev_hold(dev);
5781         list_netdevice(dev);
5782         add_device_randomness(dev->dev_addr, dev->addr_len);
5783
5784         /* Notify protocols, that a new device appeared. */
5785         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5786         ret = notifier_to_errno(ret);
5787         if (ret) {
5788                 rollback_registered(dev);
5789                 dev->reg_state = NETREG_UNREGISTERED;
5790         }
5791         /*
5792          *      Prevent userspace races by waiting until the network
5793          *      device is fully setup before sending notifications.
5794          */
             /*
              * NOTE(review): this is also reached after the rollback above,
              * so RTM_NEWLINK can follow a failed registration - confirm
              * whether that is intended.
              */
5795         if (!dev->rtnl_link_ops ||
5796             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5797                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5798
5799 out:
5800         return ret;
5801
     /* Failure after ndo_init() ran: give the driver its ndo_uninit() call. */
5802 err_uninit:
5803         if (dev->netdev_ops->ndo_uninit)
5804                 dev->netdev_ops->ndo_uninit(dev);
5805         goto out;
5806 }
5807 EXPORT_SYMBOL(register_netdevice);
5808
5809 /**
5810  *      init_dummy_netdev       - init a dummy network device for NAPI
5811  *      @dev: device to init
5812  *
5813  *      Prepares just enough of a caller-provided net_device for it to be
5814  *      used for scheduling NAPI polls, without registering a full blown
5815  *      interface. Intended for drivers that have to tie several hardware
5816  *      interfaces to a single NAPI poll scheduler due to HW limitations.
5817  *      Always returns 0.
5818  */
5819 int init_dummy_netdev(struct net_device *dev)
5820 {
5821         /* Wipe the whole structure. Spinlocks are left uninitialised
5822          * as they aren't supposed to be taken by any of the NAPI
5823          * code, and NAPI polling is the only thing this dummy
5824          * netdev is supposed to be used for.
5825          */
5826         memset(dev, 0, sizeof(*dev));
5827
5828         /* NETREG_DUMMY makes the standard register/unregister
5829          * code paths BUG if they are ever reached with this device.
5830          */
5831         dev->reg_state = NETREG_DUMMY;
5832
5833         /* NAPI wants this */
5834         INIT_LIST_HEAD(&dev->napi_list);
5835
5836         /* a dummy interface is started by default */
5837         set_bit(__LINK_STATE_START, &dev->state);
5838         set_bit(__LINK_STATE_PRESENT, &dev->state);
5839
5840         /* Note: pcpu_refcnt is not allocated for dummy devices,
5841          * because users of this 'device' do not need to change
5842          * its refcount.
5843          */
5844
5845         return 0;
5846 }
5847 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5848
5849
5850 /**
5851  *      register_netdev - register a network device
5852  *      @dev: device to register
5853  *
5854  *      Take a completed network device structure and add it to the kernel
5855  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5856  *      chain. 0 is returned on success. A negative errno code is returned
5857  *      on a failure to set up the device, or if the name is a duplicate.
5858  *
5859  *      This is register_netdevice() wrapped in rtnl_lock()/rtnl_unlock(),
5860  *      for callers that do not hold the rtnl semaphore themselves. All
5861  *      name handling happens inside register_netdevice().
5862  */
5863 int register_netdev(struct net_device *dev)
5864 {
5865         int rc;
5866
5867         rtnl_lock();
5868         rc = register_netdevice(dev);
5869         rtnl_unlock();
5870         return rc;
5871 }
5872 EXPORT_SYMBOL(register_netdev);
5873
     /*
      * Return the reference count of @dev, computed as the sum of its
      * per-cpu counters over every possible CPU.
      */
5874 int netdev_refcnt_read(const struct net_device *dev)
5875 {
5876         int cpu, total = 0;
5877
5878         for_each_possible_cpu(cpu)
5879                 total += *per_cpu_ptr(dev->pcpu_refcnt, cpu);
5880         return total;
5881 }
5882 EXPORT_SYMBOL(netdev_refcnt_read);
5883
5884 /**
5885  * netdev_wait_allrefs - wait until all references are gone.
5886  * @dev: target net_device
5887  *
5888  * This is called when unregistering network devices.
5889  *
5890  * Any protocol or device that holds a reference should register
5891  * for netdevice notification, and cleanup and put back the
5892  * reference if they receive an UNREGISTER event.
5893  * We can get stuck here if buggy protocols don't correctly
5894  * call dev_put.
      *
      * The reference count is polled every 250ms. While it is non-zero the
      * unregister notifications are rebroadcast at most once per second, and
      * a message is logged at most once every 10 seconds. The function takes
      * and releases the rtnl itself, so it sleeps and only returns once the
      * count has reached zero.
5895  */
5896 static void netdev_wait_allrefs(struct net_device *dev)
5897 {
5898         unsigned long rebroadcast_time, warning_time;
5899         int refcnt;
5900
5901         linkwatch_forget_dev(dev);
5902
5903         rebroadcast_time = warning_time = jiffies;
5904         refcnt = netdev_refcnt_read(dev);
5905
5906         while (refcnt != 0) {
                     /* More than a second since the last rebroadcast? */
5907                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5908                         rtnl_lock();
5909
5910                         /* Rebroadcast unregister notification */
5911                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5912
                             /* The rtnl is dropped around rcu_barrier(). */
5913                         __rtnl_unlock();
5914                         rcu_barrier();
5915                         rtnl_lock();
5916
5917                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5918                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5919                                      &dev->state)) {
5920                                 /* We must not have linkwatch events
5921                                  * pending on unregister. If this
5922                                  * happens, we simply run the queue
5923                                  * unscheduled, resulting in a noop
5924                                  * for this device.
5925                                  */
5926                                 linkwatch_run_queue();
5927                         }
5928
5929                         __rtnl_unlock();
5930
5931                         rebroadcast_time = jiffies;
5932                 }
5933
5934                 msleep(250);
5935
5936                 refcnt = netdev_refcnt_read(dev);
5937
                     /* Log at most once per 10 seconds, with the latest count. */
5938                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5939                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5940                                  dev->name, refcnt);
5941                         warning_time = jiffies;
5942                 }
5943         }
5944 }
5945
5946 /* The sequence is:
5947  *
5948  *      rtnl_lock();
5949  *      ...
5950  *      register_netdevice(x1);
5951  *      register_netdevice(x2);
5952  *      ...
5953  *      unregister_netdevice(y1);
5954  *      unregister_netdevice(y2);
5955  *      ...
5956  *      rtnl_unlock();
5957  *      free_netdev(y1);
5958  *      free_netdev(y2);
5959  *
5960  * We are invoked by rtnl_unlock().
5961  * This allows us to deal with problems:
5962  * 1) We can delete sysfs objects which invoke hotplug
5963  *    without deadlocking with linkwatch via keventd.
5964  * 2) Since we run with the RTNL semaphore not held, we can sleep
5965  *    safely in order to wait for the netdev refcnt to drop to zero.
5966  *
5967  * We must not return until all unregister events added during
5968  * the interval the lock was held have been completed.
      *
      * The function is entered with the rtnl still held and releases it
      * itself through __rtnl_unlock() once net_todo_list has been moved to
      * a local list.
5969  */
5970 void netdev_run_todo(void)
5971 {
5972         struct list_head list;
5973
5974         /* Snapshot list, allow later requests */
5975         list_replace_init(&net_todo_list, &list);
5976
5977         __rtnl_unlock();
5978
5979
5980         /* Wait for rcu callbacks to finish before next phase */
5981         if (!list_empty(&list))
5982                 rcu_barrier();
5983
5984         while (!list_empty(&list)) {
5985                 struct net_device *dev
5986                         = list_first_entry(&list, struct net_device, todo_list);
5987                 list_del(&dev->todo_list);
5988
                     /* The notifier is called with the rtnl held. */
5989                 rtnl_lock();
5990                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5991                 __rtnl_unlock();
5992
                     /*
                      * An entry in an unexpected state is reported and skipped;
                      * nothing further is done with it in this function.
                      */
5993                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5994                         pr_err("network todo '%s' but state %d\n",
5995                                dev->name, dev->reg_state);
5996                         dump_stack();
5997                         continue;
5998                 }
5999
6000                 dev->reg_state = NETREG_UNREGISTERED;
6001
                     /* flush_backlog() is run on every CPU and waited for. */
6002                 on_each_cpu(flush_backlog, dev, 1);
6003
                     /* May sleep until the last reference is dropped. */
6004                 netdev_wait_allrefs(dev);
6005
6006                 /* paranoia */
6007                 BUG_ON(netdev_refcnt_read(dev));
6008                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6009                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6010                 WARN_ON(dev->dn_ptr);
6011
                     /* Optional per-device destructor, called before the put. */
6012                 if (dev->destructor)
6013                         dev->destructor(dev);
6014
6015                 /* Free network device */
6016                 kobject_put(&dev->dev.kobj);
6017         }
6018 }
6019
6020 /* Copy @netdev_stats into @stats64. Both structures carry the same
6021  * fields in the same order; only the width of the counters may differ.
6022  */
6023 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6024                              const struct net_device_stats *netdev_stats)
6025 {
6026 #if BITS_PER_LONG == 64
6027         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6028         memcpy(stats64, netdev_stats, sizeof(*stats64));
6029 #else
6030         const unsigned long *in = (const unsigned long *)netdev_stats;
6031         u64 *out = (u64 *)stats64;
6032         size_t left = sizeof(*stats64) / sizeof(u64);
6033
6034         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6035                      sizeof(*stats64) / sizeof(u64));
6036         while (left--)
6037                 *out++ = *in++;
6038 #endif
6039 }
6040 EXPORT_SYMBOL(netdev_stats_to_stats64);
6041
6042 /**
6043  *      dev_get_stats   - get network device statistics
6044  *      @dev: device to get statistics from
6045  *      @storage: place to store stats
6046  *
6047  *      Fills @storage from ndo_get_stats64() if the driver has it, else
6048  *      from ndo_get_stats(), else from dev->stats; dev->rx_dropped is
6049  *      then added to @storage->rx_dropped. Returns @storage.
6050  */
6051 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6052                                         struct rtnl_link_stats64 *storage)
6053 {
6054         const struct net_device_ops *ops = dev->netdev_ops;
6055         const struct net_device_stats *legacy = &dev->stats;
6056
6057         if (ops->ndo_get_stats64) {
6058                 memset(storage, 0, sizeof(*storage));
6059                 ops->ndo_get_stats64(dev, storage);
6060         } else {
6061                 if (ops->ndo_get_stats)
6062                         legacy = ops->ndo_get_stats(dev);
6063                 netdev_stats_to_stats64(storage, legacy);
6064         }
6065         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6066         return storage;
6067 }
6068 EXPORT_SYMBOL(dev_get_stats);
6069
/*
 * dev_ingress_queue_create - return the device's ingress queue, creating it
 * on demand.
 *
 * With CONFIG_NET_CLS_ACT an absent queue is allocated, initialised with the
 * noop qdisc and published in dev->ingress_queue; NULL is returned if that
 * allocation fails.  Without CONFIG_NET_CLS_ACT nothing is created and the
 * result of dev_ingress_queue() is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only once the queue is fully set up. */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
6087
6088 static const struct ethtool_ops default_ethtool_ops;
6089
6090 /**
6091  *      alloc_netdev_mqs - allocate network device
6092  *      @sizeof_priv:   size of private data to allocate space for
6093  *      @name:          device name format string
6094  *      @setup:         callback to initialize device
6095  *      @txqs:          the number of TX subqueues to allocate
6096  *      @rxqs:          the number of RX subqueues to allocate
6097  *
6098  *      Allocates a struct net_device with private data area for driver use
6099  *      and performs basic initialization.  Also allocates subquue structs
6100  *      for each queue on the device.
6101  */
6102 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6103                 void (*setup)(struct net_device *),
6104                 unsigned int txqs, unsigned int rxqs)
6105 {
6106         struct net_device *dev;
6107         size_t alloc_size;
6108         struct net_device *p;
6109
6110         BUG_ON(strlen(name) >= sizeof(dev->name));
6111
6112         if (txqs < 1) {
6113                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6114                 return NULL;
6115         }
6116
6117 #ifdef CONFIG_RPS
6118         if (rxqs < 1) {
6119                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6120                 return NULL;
6121         }
6122 #endif
6123
6124         alloc_size = sizeof(struct net_device);
6125         if (sizeof_priv) {
6126                 /* ensure 32-byte alignment of private area */
6127                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6128                 alloc_size += sizeof_priv;
6129         }
6130         /* ensure 32-byte alignment of whole construct */
6131         alloc_size += NETDEV_ALIGN - 1;
6132
6133         p = kzalloc(alloc_size, GFP_KERNEL);
6134         if (!p) {
6135                 pr_err("alloc_netdev: Unable to allocate device\n");
6136                 return NULL;
6137         }
6138
6139         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6140         dev->padded = (char *)dev - (char *)p;
6141
6142         dev->pcpu_refcnt = alloc_percpu(int);
6143         if (!dev->pcpu_refcnt)
6144                 goto free_p;
6145
6146         if (dev_addr_init(dev))
6147                 goto free_pcpu;
6148
6149         dev_mc_init(dev);
6150         dev_uc_init(dev);
6151
6152         dev_net_set(dev, &init_net);
6153
6154         dev->gso_max_size = GSO_MAX_SIZE;
6155         dev->gso_max_segs = GSO_MAX_SEGS;
6156
6157         INIT_LIST_HEAD(&dev->napi_list);
6158         INIT_LIST_HEAD(&dev->unreg_list);
6159         INIT_LIST_HEAD(&dev->link_watch_list);
6160         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6161         setup(dev);
6162
6163         dev->num_tx_queues = txqs;
6164         dev->real_num_tx_queues = txqs;
6165         if (netif_alloc_netdev_queues(dev))
6166                 goto free_all;
6167
6168 #ifdef CONFIG_RPS
6169         dev->num_rx_queues = rxqs;
6170         dev->real_num_rx_queues = rxqs;
6171         if (netif_alloc_rx_queues(dev))
6172                 goto free_all;
6173 #endif
6174
6175         strcpy(dev->name, name);
6176         dev->group = INIT_NETDEV_GROUP;
6177         if (!dev->ethtool_ops)
6178                 dev->ethtool_ops = &default_ethtool_ops;
6179         return dev;
6180
6181 free_all:
6182         free_netdev(dev);
6183         return NULL;
6184
6185 free_pcpu:
6186         free_percpu(dev->pcpu_refcnt);
6187         kfree(dev->_tx);
6188 #ifdef CONFIG_RPS
6189         kfree(dev->_rx);
6190 #endif
6191
6192 free_p:
6193         kfree(p);
6194         return NULL;
6195 }
6196 EXPORT_SYMBOL(alloc_netdev_mqs);
6197
6198 /**
6199  *      free_netdev - free network device
6200  *      @dev: device
6201  *
6202  *      This function does the last stage of destroying an allocated device
6203  *      interface. The reference to the device object is released.
6204  *      If this is the last reference then it will be freed.
6205  */
6206 void free_netdev(struct net_device *dev)
6207 {
6208         struct napi_struct *p, *n;
6209
6210         release_net(dev_net(dev));
6211
6212         kfree(dev->_tx);
6213 #ifdef CONFIG_RPS
6214         kfree(dev->_rx);
6215 #endif
6216
6217         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6218
6219         /* Flush device addresses */
6220         dev_addr_flush(dev);
6221
6222         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6223                 netif_napi_del(p);
6224
6225         free_percpu(dev->pcpu_refcnt);
6226         dev->pcpu_refcnt = NULL;
6227
6228         /*  Compatibility with error handling in drivers */
6229         if (dev->reg_state == NETREG_UNINITIALIZED) {
6230                 kfree((char *)dev - dev->padded);
6231                 return;
6232         }
6233
6234         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6235         dev->reg_state = NETREG_RELEASED;
6236
6237         /* will free via device release */
6238         put_device(&dev->dev);
6239 }
6240 EXPORT_SYMBOL(free_netdev);
6241
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits until packets that are being received right now are done.
 *	Packets that start later are not held back.  May sleep.  The
 *	expedited RCU wait is used while the rtnl lock is held.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6257
6258 /**
6259  *      unregister_netdevice_queue - remove device from the kernel
6260  *      @dev: device
6261  *      @head: list
6262  *
6263  *      This function shuts down a device interface and removes it
6264  *      from the kernel tables.
6265  *      If head not NULL, device is queued to be unregistered later.
6266  *
6267  *      Callers must hold the rtnl semaphore.  You may want
6268  *      unregister_netdev() instead of this.
6269  */
6270
6271 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6272 {
6273         ASSERT_RTNL();
6274
6275         if (head) {
6276                 list_move_tail(&dev->unreg_list, head);
6277         } else {
6278                 rollback_registered(dev);
6279                 /* Finish processing unregister after unlock */
6280                 net_set_todo(dev);
6281         }
6282 }
6283 EXPORT_SYMBOL(unregister_netdevice_queue);
6284
6285 /**
6286  *      unregister_netdevice_many - unregister many devices
6287  *      @head: list of devices
6288  */
6289 void unregister_netdevice_many(struct list_head *head)
6290 {
6291         struct net_device *dev;
6292
6293         if (!list_empty(head)) {
6294                 rollback_registered_many(head);
6295                 list_for_each_entry(dev, head, unreg_list)
6296                         net_set_todo(dev);
6297         }
6298 }
6299 EXPORT_SYMBOL(unregister_netdevice_many);
6300
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Convenience wrapper: takes the rtnl semaphore, calls
 *	unregister_netdevice() and releases the semaphore again.  In
 *	general this is the function to use rather than
 *	unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6319
6320 /**
6321  *      dev_change_net_namespace - move device to different nethost namespace
6322  *      @dev: device
6323  *      @net: network namespace
6324  *      @pat: If not NULL name pattern to try if the current device name
6325  *            is already taken in the destination network namespace.
6326  *
6327  *      This function shuts down a device interface and moves it
6328  *      to a new network namespace. On success 0 is returned, on
6329  *      a failure a netagive errno code is returned.
6330  *
6331  *      Callers must hold the rtnl semaphore.
6332  */
6333
6334 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6335 {
6336         int err;
6337
6338         ASSERT_RTNL();
6339
6340         /* Don't allow namespace local devices to be moved. */
6341         err = -EINVAL;
6342         if (dev->features & NETIF_F_NETNS_LOCAL)
6343                 goto out;
6344
6345         /* Ensure the device has been registrered */
6346         if (dev->reg_state != NETREG_REGISTERED)
6347                 goto out;
6348
6349         /* Get out if there is nothing todo */
6350         err = 0;
6351         if (net_eq(dev_net(dev), net))
6352                 goto out;
6353
6354         /* Pick the destination device name, and ensure
6355          * we can use it in the destination network namespace.
6356          */
6357         err = -EEXIST;
6358         if (__dev_get_by_name(net, dev->name)) {
6359                 /* We get here if we can't use the current device name */
6360                 if (!pat)
6361                         goto out;
6362                 if (dev_get_valid_name(net, dev, pat) < 0)
6363                         goto out;
6364         }
6365
6366         /*
6367          * And now a mini version of register_netdevice unregister_netdevice.
6368          */
6369
6370         /* If device is running close it first. */
6371         dev_close(dev);
6372
6373         /* And unlink it from device chain */
6374         err = -ENODEV;
6375         unlist_netdevice(dev);
6376
6377         synchronize_net();
6378
6379         /* Shutdown queueing discipline. */
6380         dev_shutdown(dev);
6381
6382         /* Notify protocols, that we are about to destroy
6383            this device. They should clean all the things.
6384
6385            Note that dev->reg_state stays at NETREG_REGISTERED.
6386            This is wanted because this way 8021q and macvlan know
6387            the device is just moving and can keep their slaves up.
6388         */
6389         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6390         rcu_barrier();
6391         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6392         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6393
6394         /*
6395          *      Flush the unicast and multicast chains
6396          */
6397         dev_uc_flush(dev);
6398         dev_mc_flush(dev);
6399
6400         /* Actually switch the network namespace */
6401         dev_net_set(dev, net);
6402
6403         /* If there is an ifindex conflict assign a new one */
6404         if (__dev_get_by_index(net, dev->ifindex)) {
6405                 int iflink = (dev->iflink == dev->ifindex);
6406                 dev->ifindex = dev_new_index(net);
6407                 if (iflink)
6408                         dev->iflink = dev->ifindex;
6409         }
6410
6411         /* Fixup kobjects */
6412         err = device_rename(&dev->dev, dev->name);
6413         WARN_ON(err);
6414
6415         /* Add the device back in the hashes */
6416         list_netdevice(dev);
6417
6418         /* Notify protocols, that a new device appeared. */
6419         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6420
6421         /*
6422          *      Prevent userspace races by waiting until the network
6423          *      device is fully setup before sending notifications.
6424          */
6425         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6426
6427         synchronize_net();
6428         err = 0;
6429 out:
6430         return err;
6431 }
6432 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6433
6434 static int dev_cpu_callback(struct notifier_block *nfb,
6435                             unsigned long action,
6436                             void *ocpu)
6437 {
6438         struct sk_buff **list_skb;
6439         struct sk_buff *skb;
6440         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6441         struct softnet_data *sd, *oldsd;
6442
6443         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6444                 return NOTIFY_OK;
6445
6446         local_irq_disable();
6447         cpu = smp_processor_id();
6448         sd = &per_cpu(softnet_data, cpu);
6449         oldsd = &per_cpu(softnet_data, oldcpu);
6450
6451         /* Find end of our completion_queue. */
6452         list_skb = &sd->completion_queue;
6453         while (*list_skb)
6454                 list_skb = &(*list_skb)->next;
6455         /* Append completion queue from offline CPU. */
6456         *list_skb = oldsd->completion_queue;
6457         oldsd->completion_queue = NULL;
6458
6459         /* Append output queue from offline CPU. */
6460         if (oldsd->output_queue) {
6461                 *sd->output_queue_tailp = oldsd->output_queue;
6462                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6463                 oldsd->output_queue = NULL;
6464                 oldsd->output_queue_tailp = &oldsd->output_queue;
6465         }
6466         /* Append NAPI poll list from offline CPU. */
6467         if (!list_empty(&oldsd->poll_list)) {
6468                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6469                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6470         }
6471
6472         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6473         local_irq_enable();
6474
6475         /* Process offline CPU's input_pkt_queue */
6476         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6477                 netif_rx(skb);
6478                 input_queue_head_incr(oldsd);
6479         }
6480         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6481                 netif_rx(skb);
6482                 input_queue_head_incr(oldsd);
6483         }
6484
6485         return NOTIFY_OK;
6486 }
6487
6488
6489 /**
6490  *      netdev_increment_features - increment feature set by one
6491  *      @all: current feature set
6492  *      @one: new feature set
6493  *      @mask: mask feature set
6494  *
6495  *      Computes a new feature set after adding a device with feature set
6496  *      @one to the master device with current feature set @all.  Will not
6497  *      enable anything that is off in @mask. Returns the new feature set.
6498  */
6499 netdev_features_t netdev_increment_features(netdev_features_t all,
6500         netdev_features_t one, netdev_features_t mask)
6501 {
6502         if (mask & NETIF_F_GEN_CSUM)
6503                 mask |= NETIF_F_ALL_CSUM;
6504         mask |= NETIF_F_VLAN_CHALLENGED;
6505
6506         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6507         all &= one | ~NETIF_F_ALL_FOR_ALL;
6508
6509         /* If one device supports hw checksumming, set for all. */
6510         if (all & NETIF_F_GEN_CSUM)
6511                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6512
6513         return all;
6514 }
6515 EXPORT_SYMBOL(netdev_increment_features);
6516
6517 static struct hlist_head *netdev_create_hash(void)
6518 {
6519         int i;
6520         struct hlist_head *hash;
6521
6522         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6523         if (hash != NULL)
6524                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6525                         INIT_HLIST_HEAD(&hash[i]);
6526
6527         return hash;
6528 }
6529
6530 /* Initialize per network namespace state */
6531 static int __net_init netdev_init(struct net *net)
6532 {
6533         if (net != &init_net)
6534                 INIT_LIST_HEAD(&net->dev_base_head);
6535
6536         net->dev_name_head = netdev_create_hash();
6537         if (net->dev_name_head == NULL)
6538                 goto err_name;
6539
6540         net->dev_index_head = netdev_create_hash();
6541         if (net->dev_index_head == NULL)
6542                 goto err_idx;
6543
6544         return 0;
6545
6546 err_idx:
6547         kfree(net->dev_name_head);
6548 err_name:
6549         return -ENOMEM;
6550 }
6551
6552 /**
6553  *      netdev_drivername - network driver for the device
6554  *      @dev: network device
6555  *
6556  *      Determine network driver for device.
6557  */
6558 const char *netdev_drivername(const struct net_device *dev)
6559 {
6560         const struct device_driver *driver;
6561         const struct device *parent;
6562         const char *empty = "";
6563
6564         parent = dev->dev.parent;
6565         if (!parent)
6566                 return empty;
6567
6568         driver = parent->driver;
6569         if (driver && driver->name)
6570                 return driver->name;
6571         return empty;
6572 }
6573
6574 static int __netdev_printk(const char *level, const struct net_device *dev,
6575                            struct va_format *vaf)
6576 {
6577         int r;
6578
6579         if (dev && dev->dev.parent) {
6580                 r = dev_printk_emit(level[1] - '0',
6581                                     dev->dev.parent,
6582                                     "%s %s %s: %pV",
6583                                     dev_driver_string(dev->dev.parent),
6584                                     dev_name(dev->dev.parent),
6585                                     netdev_name(dev), vaf);
6586         } else if (dev) {
6587                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6588         } else {
6589                 r = printk("%s(NULL net_device): %pV", level, vaf);
6590         }
6591
6592         return r;
6593 }
6594
6595 int netdev_printk(const char *level, const struct net_device *dev,
6596                   const char *format, ...)
6597 {
6598         struct va_format vaf;
6599         va_list args;
6600         int r;
6601
6602         va_start(args, format);
6603
6604         vaf.fmt = format;
6605         vaf.va = &args;
6606
6607         r = __netdev_printk(level, dev, &vaf);
6608
6609         va_end(args);
6610
6611         return r;
6612 }
6613 EXPORT_SYMBOL(netdev_printk);
6614
6615 #define define_netdev_printk_level(func, level)                 \
6616 int func(const struct net_device *dev, const char *fmt, ...)    \
6617 {                                                               \
6618         int r;                                                  \
6619         struct va_format vaf;                                   \
6620         va_list args;                                           \
6621                                                                 \
6622         va_start(args, fmt);                                    \
6623                                                                 \
6624         vaf.fmt = fmt;                                          \
6625         vaf.va = &args;                                         \
6626                                                                 \
6627         r = __netdev_printk(level, dev, &vaf);                  \
6628                                                                 \
6629         va_end(args);                                           \
6630                                                                 \
6631         return r;                                               \
6632 }                                                               \
6633 EXPORT_SYMBOL(func);
6634
6635 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6636 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6637 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6638 define_netdev_printk_level(netdev_err, KERN_ERR);
6639 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6640 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6641 define_netdev_printk_level(netdev_info, KERN_INFO);
6642
6643 static void __net_exit netdev_exit(struct net *net)
6644 {
6645         kfree(net->dev_name_head);
6646         kfree(net->dev_index_head);
6647 }
6648
6649 static struct pernet_operations __net_initdata netdev_net_ops = {
6650         .init = netdev_init,
6651         .exit = netdev_exit,
6652 };
6653
6654 static void __net_exit default_device_exit(struct net *net)
6655 {
6656         struct net_device *dev, *aux;
6657         /*
6658          * Push all migratable network devices back to the
6659          * initial network namespace
6660          */
6661         rtnl_lock();
6662         for_each_netdev_safe(net, dev, aux) {
6663                 int err;
6664                 char fb_name[IFNAMSIZ];
6665
6666                 /* Ignore unmoveable devices (i.e. loopback) */
6667                 if (dev->features & NETIF_F_NETNS_LOCAL)
6668                         continue;
6669
6670                 /* Leave virtual devices for the generic cleanup */
6671                 if (dev->rtnl_link_ops)
6672                         continue;
6673
6674                 /* Push remaining network devices to init_net */
6675                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6676                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6677                 if (err) {
6678                         pr_emerg("%s: failed to move %s to init_net: %d\n",
6679                                  __func__, dev->name, err);
6680                         BUG();
6681                 }
6682         }
6683         rtnl_unlock();
6684 }
6685
6686 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6687 {
6688         /* At exit all network devices most be removed from a network
6689          * namespace.  Do this in the reverse order of registration.
6690          * Do this across as many network namespaces as possible to
6691          * improve batching efficiency.
6692          */
6693         struct net_device *dev;
6694         struct net *net;
6695         LIST_HEAD(dev_kill_list);
6696
6697         rtnl_lock();
6698         list_for_each_entry(net, net_list, exit_list) {
6699                 for_each_netdev_reverse(net, dev) {
6700                         if (dev->rtnl_link_ops)
6701                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6702                         else
6703                                 unregister_netdevice_queue(dev, &dev_kill_list);
6704                 }
6705         }
6706         unregister_netdevice_many(&dev_kill_list);
6707         list_del(&dev_kill_list);
6708         rtnl_unlock();
6709 }
6710
6711 static struct pernet_operations __net_initdata default_device_ops = {
6712         .exit = default_device_exit,
6713         .exit_batch = default_device_exit_batch,
6714 };
6715
6716 /*
6717  *      Initialize the DEV module. At boot time this walks the device list and
6718  *      unhooks any devices that fail to initialise (normally hardware not
6719  *      present) and leaves us with a valid list of present and active devices.
6720  *
6721  */
6722
6723 /*
6724  *       This is called single threaded during boot, so no need
6725  *       to take the rtnl semaphore.
6726  */
6727 static int __init net_dev_init(void)
6728 {
6729         int i, rc = -ENOMEM;
6730
6731         BUG_ON(!dev_boot_phase);
6732
6733         if (dev_proc_init())
6734                 goto out;
6735
6736         if (netdev_kobject_init())
6737                 goto out;
6738
6739         INIT_LIST_HEAD(&ptype_all);
6740         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6741                 INIT_LIST_HEAD(&ptype_base[i]);
6742
6743         INIT_LIST_HEAD(&offload_base);
6744
6745         if (register_pernet_subsys(&netdev_net_ops))
6746                 goto out;
6747
6748         /*
6749          *      Initialise the packet receive queues.
6750          */
6751
6752         for_each_possible_cpu(i) {
6753                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6754
6755                 memset(sd, 0, sizeof(*sd));
6756                 skb_queue_head_init(&sd->input_pkt_queue);
6757                 skb_queue_head_init(&sd->process_queue);
6758                 sd->completion_queue = NULL;
6759                 INIT_LIST_HEAD(&sd->poll_list);
6760                 sd->output_queue = NULL;
6761                 sd->output_queue_tailp = &sd->output_queue;
6762 #ifdef CONFIG_RPS
6763                 sd->csd.func = rps_trigger_softirq;
6764                 sd->csd.info = sd;
6765                 sd->csd.flags = 0;
6766                 sd->cpu = i;
6767 #endif
6768
6769                 sd->backlog.poll = process_backlog;
6770                 sd->backlog.weight = weight_p;
6771                 sd->backlog.gro_list = NULL;
6772                 sd->backlog.gro_count = 0;
6773         }
6774
6775         dev_boot_phase = 0;
6776
6777         /* The loopback device is special if any other network devices
6778          * is present in a network namespace the loopback device must
6779          * be present. Since we now dynamically allocate and free the
6780          * loopback device ensure this invariant is maintained by
6781          * keeping the loopback device as the first device on the
6782          * list of network devices.  Ensuring the loopback devices
6783          * is the first device that appears and the last network device
6784          * that disappears.
6785          */
6786         if (register_pernet_device(&loopback_net_ops))
6787                 goto out;
6788
6789         if (register_pernet_device(&default_device_ops))
6790                 goto out;
6791
6792         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6793         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6794
6795         hotcpu_notifier(dev_cpu_callback, 0);
6796         dst_init();
6797         dev_mcast_init();
6798         rc = 0;
6799 out:
6800         return rc;
6801 }
6802
6803 subsys_initcall(net_dev_init);
6804
6805 static int __init initialize_hashrnd(void)
6806 {
6807         get_random_bytes(&hashrnd, sizeof(hashrnd));
6808         return 0;
6809 }
6810
6811 late_initcall_sync(initialize_hashrnd);
6812