net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135
 136 #include "net-sysfs.h"
 137
 138 /* Instead of increasing this, you should create a hash table. */
 139 #define MAX_GRO_SKBS 8
 140
 141 /* This should be increased if a protocol with a bigger head is added. */
 142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 143
 144 /*
 145  *      The list of packet types we will receive (as opposed to discard)
 146  *      and the routines to invoke.
 147  *
 148  *      Why 16. Because with 16 the only overlap we get on a hash of the
 149  *      low nibble of the protocol value is RARP/SNAP/X.25.
 150  *
 151  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 152  *             sure which should go first, but I bet it won't make much
 153  *             difference if we are running VLANs.  The good news is that
 154  *             this protocol won't be in the list unless compiled in, so
 155  *             the average user (w/out VLANs) will not be adversely affected.
 156  *             --BLG
 157  *
 158  *              0800    IP
 159  *              8100    802.1Q VLAN
 160  *              0001    802.3
 161  *              0002    AX.25
 162  *              0004    802.2
 163  *              8035    RARP
 164  *              0005    SNAP
 165  *              0805    X.25
 166  *              0806    ARP
 167  *              8137    IPX
 168  *              0009    Localtalk
 169  *              86DD    IPv6
 170  */
 171
 172 #define PTYPE_HASH_SIZE (16)
 173 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 174
 175 static DEFINE_SPINLOCK(ptype_lock);
 176 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 177 static struct list_head ptype_all __read_mostly;        /* Taps */
 178
 179 /*
 180  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 181  * semaphore.
 182  *
 183  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 184  *
 185  * Writers must hold the rtnl semaphore while they loop through the
 186  * dev_base_head list, and hold dev_base_lock for writing when they do the
 187  * actual updates.  This allows pure readers to access the list even
 188  * while a writer is preparing to update it.
 189  *
 190  * To put it another way, dev_base_lock is held for writing only to
 191  * protect against pure readers; the rtnl semaphore provides the
 192  * protection against other writers.
 193  *
 194  * See, for example usages, register_netdevice() and
 195  * unregister_netdevice(), which must be called with the rtnl
 196  * semaphore held.
 197  */
 198 DEFINE_RWLOCK(dev_base_lock);
 199 EXPORT_SYMBOL(dev_base_lock);
 200
 201 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 202 {
 203         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 204         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 205 }
 206
 207 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 208 {
 209         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 210 }
 211
 212 static inline void rps_lock(struct softnet_data *sd)
 213 {
 214 #ifdef CONFIG_RPS
 215         spin_lock(&sd->input_pkt_queue.lock);
 216 #endif
 217 }
 218
 219 static inline void rps_unlock(struct softnet_data *sd)
 220 {
 221 #ifdef CONFIG_RPS
 222         spin_unlock(&sd->input_pkt_queue.lock);
 223 #endif
 224 }
 225
 226 /* Device list insertion */
 227 static int list_netdevice(struct net_device *dev)
 228 {
 229         struct net *net = dev_net(dev);
 230
 231         ASSERT_RTNL();
 232
 233         write_lock_bh(&dev_base_lock);
 234         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 235         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 236         hlist_add_head_rcu(&dev->index_hlist,
 237                            dev_index_hash(net, dev->ifindex));
 238         write_unlock_bh(&dev_base_lock);
 239         return 0;
 240 }
 241
 242 /* Device list removal
 243  * caller must respect a RCU grace period before freeing/reusing dev
 244  */
 245 static void unlist_netdevice(struct net_device *dev)
 246 {
 247         ASSERT_RTNL();
 248
 249         /* Unlink dev from the device chain */
 250         write_lock_bh(&dev_base_lock);
 251         list_del_rcu(&dev->dev_list);
 252         hlist_del_rcu(&dev->name_hlist);
 253         hlist_del_rcu(&dev->index_hlist);
 254         write_unlock_bh(&dev_base_lock);
 255 }
 256
 257 /*
 258  *      Our notifier list
 259  */
 260
 261 static RAW_NOTIFIER_HEAD(netdev_chain);
 262
 263 /*
 264  *      Device drivers call our routines to queue packets here. We empty the
 265  *      queue in the local softnet handler.
 266  */
 267
 268 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 269 EXPORT_PER_CPU_SYMBOL(softnet_data);
 270
 271 #ifdef CONFIG_LOCKDEP
 272 /*
 273  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 274  * according to dev->type
 275  */
 276 static const unsigned short netdev_lock_type[] =
 277         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 278          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 279          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 280          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 281          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 282          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 283          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 284          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 285          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 286          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 287          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 288          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 289          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 290          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 291          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 292          ARPHRD_VOID, ARPHRD_NONE};
 293
 294 static const char *const netdev_lock_name[] =
 295         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 296          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 297          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 298          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 299          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 300          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 301          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 302          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 303          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 304          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 305          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 306          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 307          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 308          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 309          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 310          "_xmit_VOID", "_xmit_NONE"};
 311
 312 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 314
 315 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 316 {
 317         int i;
 318
 319         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 320                 if (netdev_lock_type[i] == dev_type)
 321                         return i;
 322         /* the last key is used by default */
 323         return ARRAY_SIZE(netdev_lock_type) - 1;
 324 }
 325
 326 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 327                                                  unsigned short dev_type)
 328 {
 329         int i;
 330
 331         i = netdev_lock_pos(dev_type);
 332         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 333                                    netdev_lock_name[i]);
 334 }
 335
 336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 337 {
 338         int i;
 339
 340         i = netdev_lock_pos(dev->type);
 341         lockdep_set_class_and_name(&dev->addr_list_lock,
 342                                    &netdev_addr_lock_key[i],
 343                                    netdev_lock_name[i]);
 344 }
 345 #else
 346 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 347                                                  unsigned short dev_type)
 348 {
 349 }
 350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 351 {
 352 }
 353 #endif
 354
 355 /*******************************************************************************
 356
 357                 Protocol management and registration routines
 358
 359 *******************************************************************************/
 360
 361 /*
 362  *      Add a protocol ID to the list. Now that the input handler is
 363  *      smarter we can dispense with all the messy stuff that used to be
 364  *      here.
 365  *
 366  *      BEWARE!!! Protocol handlers, mangling input packets,
 367  *      MUST BE last in hash buckets and checking protocol handlers
 368  *      MUST start from promiscuous ptype_all chain in net_bh.
 369  *      It is true now, do not change it.
 370  *      Explanation follows: if protocol handler, mangling packet, will
 371  *      be the first on list, it is not able to sense, that packet
 372  *      is cloned and should be copied-on-write, so that it will
 373  *      change it and subsequent readers will get broken packet.
 374  *                                                      --ANK (980803)
 375  */
 376
 377 static inline struct list_head *ptype_head(const struct packet_type *pt)
 378 {
 379         if (pt->type == htons(ETH_P_ALL))
 380                 return &ptype_all;
 381         else
 382                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 383 }
 384
 385 /**
 386  *      dev_add_pack - add packet handler
 387  *      @pt: packet type declaration
 388  *
 389  *      Add a protocol handler to the networking stack. The passed &packet_type
 390  *      is linked into kernel lists and may not be freed until it has been
 391  *      removed from the kernel lists.
 392  *
 393  *      This call does not sleep therefore it can not
 394  *      guarantee all CPU's that are in middle of receiving packets
 395  *      will see the new packet type (until the next received packet).
 396  */
 397
 398 void dev_add_pack(struct packet_type *pt)
 399 {
 400         struct list_head *head = ptype_head(pt);
 401
 402         spin_lock(&ptype_lock);
 403         list_add_rcu(&pt->list, head);
 404         spin_unlock(&ptype_lock);
 405 }
 406 EXPORT_SYMBOL(dev_add_pack);
 407
 408 /**
 409  *      __dev_remove_pack        - remove packet handler
 410  *      @pt: packet type declaration
 411  *
 412  *      Remove a protocol handler that was previously added to the kernel
 413  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 414  *      from the kernel lists and can be freed or reused once this function
 415  *      returns.
 416  *
 417  *      The packet type might still be in use by receivers
 418  *      and must not be freed until after all the CPU's have gone
 419  *      through a quiescent state.
 420  */
 421 void __dev_remove_pack(struct packet_type *pt)
 422 {
 423         struct list_head *head = ptype_head(pt);
 424         struct packet_type *pt1;
 425
 426         spin_lock(&ptype_lock);
 427
 428         list_for_each_entry(pt1, head, list) {
 429                 if (pt == pt1) {
 430                         list_del_rcu(&pt->list);
 431                         goto out;
 432                 }
 433         }
 434
 435         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 436 out:
 437         spin_unlock(&ptype_lock);
 438 }
 439 EXPORT_SYMBOL(__dev_remove_pack);
 440
 441 /**
 442  *      dev_remove_pack  - remove packet handler
 443  *      @pt: packet type declaration
 444  *
 445  *      Remove a protocol handler that was previously added to the kernel
 446  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 447  *      from the kernel lists and can be freed or reused once this function
 448  *      returns.
 449  *
 450  *      This call sleeps to guarantee that no CPU is looking at the packet
 451  *      type after return.
 452  */
 453 void dev_remove_pack(struct packet_type *pt)
 454 {
 455         __dev_remove_pack(pt);
 456
 457         synchronize_net();
 458 }
 459 EXPORT_SYMBOL(dev_remove_pack);
 460
 461 /******************************************************************************
 462
 463                       Device Boot-time Settings Routines
 464
 465 *******************************************************************************/
 466
 467 /* Boot time configuration table */
 468 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 469
 470 /**
 471  *      netdev_boot_setup_add   - add new setup entry
 472  *      @name: name of the device
 473  *      @map: configured settings for the device
 474  *
 475  *      Adds new setup entry to the dev_boot_setup list.  The function
 476  *      returns 0 on error and 1 on success.  This is a generic routine to
 477  *      all netdevices.
 478  */
 479 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 480 {
 481         struct netdev_boot_setup *s;
 482         int i;
 483
 484         s = dev_boot_setup;
 485         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 486                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 487                         memset(s[i].name, 0, sizeof(s[i].name));
 488                         strlcpy(s[i].name, name, IFNAMSIZ);
 489                         memcpy(&s[i].map, map, sizeof(s[i].map));
 490                         break;
 491                 }
 492         }
 493
 494         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 495 }
 496
 497 /**
 498  *      netdev_boot_setup_check - check boot time settings
 499  *      @dev: the netdevice
 500  *
 501  *      Check boot time settings for the device.
 502  *      The found settings are set for the device to be used
 503  *      later in the device probing.
 504  *      Returns 0 if no settings found, 1 if they are.
 505  */
 506 int netdev_boot_setup_check(struct net_device *dev)
 507 {
 508         struct netdev_boot_setup *s = dev_boot_setup;
 509         int i;
 510
 511         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 512                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 513                     !strcmp(dev->name, s[i].name)) {
 514                         dev->irq        = s[i].map.irq;
 515                         dev->base_addr  = s[i].map.base_addr;
 516                         dev->mem_start  = s[i].map.mem_start;
 517                         dev->mem_end    = s[i].map.mem_end;
 518                         return 1;
 519                 }
 520         }
 521         return 0;
 522 }
 523 EXPORT_SYMBOL(netdev_boot_setup_check);
 524
 525
 526 /**
 527  *      netdev_boot_base        - get address from boot time settings
 528  *      @prefix: prefix for network device
 529  *      @unit: id for network device
 530  *
 531  *      Check boot time settings for the base address of device.
 532  *      The found settings are set for the device to be used
 533  *      later in the device probing.
 534  *      Returns 0 if no settings found.
 535  */
 536 unsigned long netdev_boot_base(const char *prefix, int unit)
 537 {
 538         const struct netdev_boot_setup *s = dev_boot_setup;
 539         char name[IFNAMSIZ];
 540         int i;
 541
 542         sprintf(name, "%s%d", prefix, unit);
 543
 544         /*
 545          * If device already registered then return base of 1
 546          * to indicate not to probe for this interface
 547          */
 548         if (__dev_get_by_name(&init_net, name))
 549                 return 1;
 550
 551         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 552                 if (!strcmp(name, s[i].name))
 553                         return s[i].map.base_addr;
 554         return 0;
 555 }
 556
 557 /*
 558  * Saves at boot time configured settings for any netdevice.
 559  */
 560 int __init netdev_boot_setup(char *str)
 561 {
 562         int ints[5];
 563         struct ifmap map;
 564
 565         str = get_options(str, ARRAY_SIZE(ints), ints);
 566         if (!str || !*str)
 567                 return 0;
 568
 569         /* Save settings */
 570         memset(&map, 0, sizeof(map));
 571         if (ints[0] > 0)
 572                 map.irq = ints[1];
 573         if (ints[0] > 1)
 574                 map.base_addr = ints[2];
 575         if (ints[0] > 2)
 576                 map.mem_start = ints[3];
 577         if (ints[0] > 3)
 578                 map.mem_end = ints[4];
 579
 580         /* Add new entry to the list */
 581         return netdev_boot_setup_add(str, &map);
 582 }
 583
 584 __setup("netdev=", netdev_boot_setup);
 585
 586 /*******************************************************************************
 587
 588                             Device Interface Subroutines
 589
 590 *******************************************************************************/
 591
 592 /**
 593  *      __dev_get_by_name       - find a device by its name
 594  *      @net: the applicable net namespace
 595  *      @name: name to find
 596  *
 597  *      Find an interface by name. Must be called under RTNL semaphore
 598  *      or @dev_base_lock. If the name is found a pointer to the device
 599  *      is returned. If the name is not found then %NULL is returned. The
 600  *      reference counters are not incremented so the caller must be
 601  *      careful with locks.
 602  */
 603
 604 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 605 {
 606         struct hlist_node *p;
 607         struct net_device *dev;
 608         struct hlist_head *head = dev_name_hash(net, name);
 609
 610         hlist_for_each_entry(dev, p, head, name_hlist)
 611                 if (!strncmp(dev->name, name, IFNAMSIZ))
 612                         return dev;
 613
 614         return NULL;
 615 }
 616 EXPORT_SYMBOL(__dev_get_by_name);
 617
 618 /**
 619  *      dev_get_by_name_rcu     - find a device by its name
 620  *      @net: the applicable net namespace
 621  *      @name: name to find
 622  *
 623  *      Find an interface by name.
 624  *      If the name is found a pointer to the device is returned.
 625  *      If the name is not found then %NULL is returned.
 626  *      The reference counters are not incremented so the caller must be
 627  *      careful with locks. The caller must hold RCU lock.
 628  */
 629
 630 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 631 {
 632         struct hlist_node *p;
 633         struct net_device *dev;
 634         struct hlist_head *head = dev_name_hash(net, name);
 635
 636         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 637                 if (!strncmp(dev->name, name, IFNAMSIZ))
 638                         return dev;
 639
 640         return NULL;
 641 }
 642 EXPORT_SYMBOL(dev_get_by_name_rcu);
 643
 644 /**
 645  *      dev_get_by_name         - find a device by its name
 646  *      @net: the applicable net namespace
 647  *      @name: name to find
 648  *
 649  *      Find an interface by name. This can be called from any
 650  *      context and does its own locking. The returned handle has
 651  *      the usage count incremented and the caller must use dev_put() to
 652  *      release it when it is no longer needed. %NULL is returned if no
 653  *      matching device is found.
 654  */
 655
 656 struct net_device *dev_get_by_name(struct net *net, const char *name)
 657 {
 658         struct net_device *dev;
 659
 660         rcu_read_lock();
 661         dev = dev_get_by_name_rcu(net, name);
 662         if (dev)
 663                 dev_hold(dev);
 664         rcu_read_unlock();
 665         return dev;
 666 }
 667 EXPORT_SYMBOL(dev_get_by_name);
 668
 669 /**
 670  *      __dev_get_by_index - find a device by its ifindex
 671  *      @net: the applicable net namespace
 672  *      @ifindex: index of device
 673  *
 674  *      Search for an interface by index. Returns %NULL if the device
 675  *      is not found or a pointer to the device. The device has not
 676  *      had its reference counter increased so the caller must be careful
 677  *      about locking. The caller must hold either the RTNL semaphore
 678  *      or @dev_base_lock.
 679  */
 680
 681 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 682 {
 683         struct hlist_node *p;
 684         struct net_device *dev;
 685         struct hlist_head *head = dev_index_hash(net, ifindex);
 686
 687         hlist_for_each_entry(dev, p, head, index_hlist)
 688                 if (dev->ifindex == ifindex)
 689                         return dev;
 690
 691         return NULL;
 692 }
 693 EXPORT_SYMBOL(__dev_get_by_index);
 694
 695 /**
 696  *      dev_get_by_index_rcu - find a device by its ifindex
 697  *      @net: the applicable net namespace
 698  *      @ifindex: index of device
 699  *
 700  *      Search for an interface by index. Returns %NULL if the device
 701  *      is not found or a pointer to the device. The device has not
 702  *      had its reference counter increased so the caller must be careful
 703  *      about locking. The caller must hold RCU lock.
 704  */
 705
 706 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 707 {
 708         struct hlist_node *p;
 709         struct net_device *dev;
 710         struct hlist_head *head = dev_index_hash(net, ifindex);
 711
 712         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 713                 if (dev->ifindex == ifindex)
 714                         return dev;
 715
 716         return NULL;
 717 }
 718 EXPORT_SYMBOL(dev_get_by_index_rcu);
 719
 720
 721 /**
 722  *      dev_get_by_index - find a device by its ifindex
 723  *      @net: the applicable net namespace
 724  *      @ifindex: index of device
 725  *
 726  *      Search for an interface by index. Returns NULL if the device
 727  *      is not found or a pointer to the device. The device returned has
 728  *      had a reference added and the pointer is safe until the user calls
 729  *      dev_put to indicate they have finished with it.
 730  */
 731
 732 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 733 {
 734         struct net_device *dev;
 735
 736         rcu_read_lock();
 737         dev = dev_get_by_index_rcu(net, ifindex);
 738         if (dev)
 739                 dev_hold(dev);
 740         rcu_read_unlock();
 741         return dev;
 742 }
 743 EXPORT_SYMBOL(dev_get_by_index);
 744
 745 /**
 746  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 747  *      @net: the applicable net namespace
 748  *      @type: media type of device
 749  *      @ha: hardware address
 750  *
 751  *      Search for an interface by MAC address. Returns NULL if the device
 752  *      is not found or a pointer to the device. The caller must hold RCU
 753  *      The returned device has not had its ref count increased
 754  *      and the caller must therefore be careful about locking
 755  *
 756  */
 757
 758 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 759                                        const char *ha)
 760 {
 761         struct net_device *dev;
 762
 763         for_each_netdev_rcu(net, dev)
 764                 if (dev->type == type &&
 765                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 766                         return dev;
 767
 768         return NULL;
 769 }
 770 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 771
 772 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 773 {
 774         struct net_device *dev;
 775
 776         ASSERT_RTNL();
 777         for_each_netdev(net, dev)
 778                 if (dev->type == type)
 779                         return dev;
 780
 781         return NULL;
 782 }
 783 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 784
 785 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 786 {
 787         struct net_device *dev, *ret = NULL;
 788
 789         rcu_read_lock();
 790         for_each_netdev_rcu(net, dev)
 791                 if (dev->type == type) {
 792                         dev_hold(dev);
 793                         ret = dev;
 794                         break;
 795                 }
 796         rcu_read_unlock();
 797         return ret;
 798 }
 799 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 800
 801 /**
 802  *      dev_get_by_flags_rcu - find any device with given flags
 803  *      @net: the applicable net namespace
 804  *      @if_flags: IFF_* values
 805  *      @mask: bitmask of bits in if_flags to check
 806  *
 807  *      Search for any interface with the given flags. Returns NULL if a device
 808  *      is not found or a pointer to the device. Must be called inside
 809  *      rcu_read_lock(), and result refcount is unchanged.
 810  */
 811
 812 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 813                                     unsigned short mask)
 814 {
 815         struct net_device *dev, *ret;
 816
 817         ret = NULL;
 818         for_each_netdev_rcu(net, dev) {
 819                 if (((dev->flags ^ if_flags) & mask) == 0) {
 820                         ret = dev;
 821                         break;
 822                 }
 823         }
 824         return ret;
 825 }
 826 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 827
 828 /**
 829  *      dev_valid_name - check if name is okay for network device
 830  *      @name: name string
 831  *
 832  *      Network device names need to be valid file names to
 833  *      to allow sysfs to work.  We also disallow any kind of
 834  *      whitespace.
 835  */
 836 int dev_valid_name(const char *name)
 837 {
 838         if (*name == '\0')
 839                 return 0;
 840         if (strlen(name) >= IFNAMSIZ)
 841                 return 0;
 842         if (!strcmp(name, ".") || !strcmp(name, ".."))
 843                 return 0;
 844
 845         while (*name) {
 846                 if (*name == '/' || isspace(*name))
 847                         return 0;
 848                 name++;
 849         }
 850         return 1;
 851 }
 852 EXPORT_SYMBOL(dev_valid_name);
 853
 854 /**
 855  *      __dev_alloc_name - allocate a name for a device
 856  *      @net: network namespace to allocate the device name in
 857  *      @name: name format string
 858  *      @buf:  scratch buffer and result name string
 859  *
 860  *      Passed a format string - eg "lt%d" it will try and find a suitable
 861  *      id. It scans list of devices to build up a free map, then chooses
 862  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 863  *      while allocating the name and adding the device in order to avoid
 864  *      duplicates.
 865  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 866  *      Returns the number of the unit assigned or a negative errno code.
 867  */
 868
 869 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 870 {
 871         int i = 0;
 872         const char *p;
 873         const int max_netdevices = 8*PAGE_SIZE;
 874         unsigned long *inuse;
 875         struct net_device *d;
 876
 877         p = strnchr(name, IFNAMSIZ-1, '%');
 878         if (p) {
 879                 /*
 880                  * Verify the string as this thing may have come from
 881                  * the user.  There must be either one "%d" and no other "%"
 882                  * characters.
 883                  */
 884                 if (p[1] != 'd' || strchr(p + 2, '%'))
 885                         return -EINVAL;
 886
 887                 /* Use one page as a bit array of possible slots */
 888                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 889                 if (!inuse)
 890                         return -ENOMEM;
 891
 892                 for_each_netdev(net, d) {
 893                         if (!sscanf(d->name, name, &i))
 894                                 continue;
 895                         if (i < 0 || i >= max_netdevices)
 896                                 continue;
 897
 898                         /*  avoid cases where sscanf is not exact inverse of printf */
 899                         snprintf(buf, IFNAMSIZ, name, i);
 900                         if (!strncmp(buf, d->name, IFNAMSIZ))
 901                                 set_bit(i, inuse);
 902                 }
 903
 904                 i = find_first_zero_bit(inuse, max_netdevices);
 905                 free_page((unsigned long) inuse);
 906         }
 907
 908         if (buf != name)
 909                 snprintf(buf, IFNAMSIZ, name, i);
 910         if (!__dev_get_by_name(net, buf))
 911                 return i;
 912
 913         /* It is possible to run out of possible slots
 914          * when the name is long and there isn't enough space left
 915          * for the digits, or if all bits are used.
 916          */
 917         return -ENFILE;
 918 }
 919
 920 /**
 921  *      dev_alloc_name - allocate a name for a device
 922  *      @dev: device
 923  *      @name: name format string
 924  *
 925  *      Passed a format string - eg "lt%d" it will try and find a suitable
 926  *      id. It scans list of devices to build up a free map, then chooses
 927  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 928  *      while allocating the name and adding the device in order to avoid
 929  *      duplicates.
 930  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 931  *      Returns the number of the unit assigned or a negative errno code.
 932  */
 933
 934 int dev_alloc_name(struct net_device *dev, const char *name)
 935 {
 936         char buf[IFNAMSIZ];
 937         struct net *net;
 938         int ret;
 939
 940         BUG_ON(!dev_net(dev));
 941         net = dev_net(dev);
 942         ret = __dev_alloc_name(net, name, buf);
 943         if (ret >= 0)
 944                 strlcpy(dev->name, buf, IFNAMSIZ);
 945         return ret;
 946 }
 947 EXPORT_SYMBOL(dev_alloc_name);
 948
 949 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 950 {
 951         struct net *net;
 952
 953         BUG_ON(!dev_net(dev));
 954         net = dev_net(dev);
 955
 956         if (!dev_valid_name(name))
 957                 return -EINVAL;
 958
 959         if (fmt && strchr(name, '%'))
 960                 return dev_alloc_name(dev, name);
 961         else if (__dev_get_by_name(net, name))
 962                 return -EEXIST;
 963         else if (dev->name != name)
 964                 strlcpy(dev->name, name, IFNAMSIZ);
 965
 966         return 0;
 967 }
 968
 969 /**
 970  *      dev_change_name - change name of a device
 971  *      @dev: device
 972  *      @newname: name (or format string) must be at least IFNAMSIZ
 973  *
 974  *      Change name of a device, can pass format strings "eth%d".
 975  *      for wildcarding.
 976  */
 977 int dev_change_name(struct net_device *dev, const char *newname)
 978 {
 979         char oldname[IFNAMSIZ];
 980         int err = 0;
 981         int ret;
 982         struct net *net;
 983
 984         ASSERT_RTNL();
 985         BUG_ON(!dev_net(dev));
 986
 987         net = dev_net(dev);
 988         if (dev->flags & IFF_UP)
 989                 return -EBUSY;
 990
 991         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 992                 return 0;
 993
 994         memcpy(oldname, dev->name, IFNAMSIZ);
 995
 996         err = dev_get_valid_name(dev, newname, 1);
 997         if (err < 0)
 998                 return err;
 999
1000 rollback:
1001         ret = device_rename(&dev->dev, dev->name);
1002         if (ret) {
1003                 memcpy(dev->name, oldname, IFNAMSIZ);
1004                 return ret;
1005         }
1006
1007         write_lock_bh(&dev_base_lock);
1008         hlist_del(&dev->name_hlist);
1009         write_unlock_bh(&dev_base_lock);
1010
1011         synchronize_rcu();
1012
1013         write_lock_bh(&dev_base_lock);
1014         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1015         write_unlock_bh(&dev_base_lock);
1016
1017         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1018         ret = notifier_to_errno(ret);
1019
1020         if (ret) {
1021                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1022                 if (err >= 0) {
1023                         err = ret;
1024                         memcpy(dev->name, oldname, IFNAMSIZ);
1025                         goto rollback;
1026                 } else {
1027                         printk(KERN_ERR
1028                                "%s: name change rollback failed: %d.\n",
1029                                dev->name, ret);
1030                 }
1031         }
1032
1033         return err;
1034 }
1035
1036 /**
1037  *      dev_set_alias - change ifalias of a device
1038  *      @dev: device
1039  *      @alias: name up to IFALIASZ
1040  *      @len: limit of bytes to copy from info
1041  *
1042  *      Set ifalias for a device,
1043  */
1044 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1045 {
1046         ASSERT_RTNL();
1047
1048         if (len >= IFALIASZ)
1049                 return -EINVAL;
1050
1051         if (!len) {
1052                 if (dev->ifalias) {
1053                         kfree(dev->ifalias);
1054                         dev->ifalias = NULL;
1055                 }
1056                 return 0;
1057         }
1058
1059         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1060         if (!dev->ifalias)
1061                 return -ENOMEM;
1062
1063         strlcpy(dev->ifalias, alias, len+1);
1064         return len;
1065 }
1066
1067
1068 /**
1069  *      netdev_features_change - device changes features
1070  *      @dev: device to cause notification
1071  *
1072  *      Called to indicate a device has changed features.
1073  */
1074 void netdev_features_change(struct net_device *dev)
1075 {
1076         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1077 }
1078 EXPORT_SYMBOL(netdev_features_change);
1079
1080 /**
1081  *      netdev_state_change - device changes state
1082  *      @dev: device to cause notification
1083  *
1084  *      Called to indicate a device has changed state. This function calls
1085  *      the notifier chains for netdev_chain and sends a NEWLINK message
1086  *      to the routing socket.
1087  */
1088 void netdev_state_change(struct net_device *dev)
1089 {
1090         if (dev->flags & IFF_UP) {
1091                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1092                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1093         }
1094 }
1095 EXPORT_SYMBOL(netdev_state_change);
1096
1097 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1098 {
1099         return call_netdevice_notifiers(event, dev);
1100 }
1101 EXPORT_SYMBOL(netdev_bonding_change);
1102
1103 /**
1104  *      dev_load        - load a network module
1105  *      @net: the applicable net namespace
1106  *      @name: name of interface
1107  *
1108  *      If a network interface is not present and the process has suitable
1109  *      privileges this function loads the module. If module loading is not
1110  *      available in this kernel then it becomes a nop.
1111  */
1112
1113 void dev_load(struct net *net, const char *name)
1114 {
1115         struct net_device *dev;
1116
1117         rcu_read_lock();
1118         dev = dev_get_by_name_rcu(net, name);
1119         rcu_read_unlock();
1120
1121         if (!dev && capable(CAP_NET_ADMIN))
1122                 request_module("%s", name);
1123 }
1124 EXPORT_SYMBOL(dev_load);
1125
1126 static int __dev_open(struct net_device *dev)
1127 {
1128         const struct net_device_ops *ops = dev->netdev_ops;
1129         int ret;
1130
1131         ASSERT_RTNL();
1132
1133         /*
1134          *      Is it even present?
1135          */
1136         if (!netif_device_present(dev))
1137                 return -ENODEV;
1138
1139         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1140         ret = notifier_to_errno(ret);
1141         if (ret)
1142                 return ret;
1143
1144         /*
1145          *      Call device private open method
1146          */
1147         set_bit(__LINK_STATE_START, &dev->state);
1148
1149         if (ops->ndo_validate_addr)
1150                 ret = ops->ndo_validate_addr(dev);
1151
1152         if (!ret && ops->ndo_open)
1153                 ret = ops->ndo_open(dev);
1154
1155         /*
1156          *      If it went open OK then:
1157          */
1158
1159         if (ret)
1160                 clear_bit(__LINK_STATE_START, &dev->state);
1161         else {
1162                 /*
1163                  *      Set the flags.
1164                  */
1165                 dev->flags |= IFF_UP;
1166
1167                 /*
1168                  *      Enable NET_DMA
1169                  */
1170                 net_dmaengine_get();
1171
1172                 /*
1173                  *      Initialize multicasting status
1174                  */
1175                 dev_set_rx_mode(dev);
1176
1177                 /*
1178                  *      Wakeup transmit queue engine
1179                  */
1180                 dev_activate(dev);
1181         }
1182
1183         return ret;
1184 }
1185
1186 /**
1187  *      dev_open        - prepare an interface for use.
1188  *      @dev:   device to open
1189  *
1190  *      Takes a device from down to up state. The device's private open
1191  *      function is invoked and then the multicast lists are loaded. Finally
1192  *      the device is moved into the up state and a %NETDEV_UP message is
1193  *      sent to the netdev notifier chain.
1194  *
1195  *      Calling this function on an active interface is a nop. On a failure
1196  *      a negative errno code is returned.
1197  */
1198 int dev_open(struct net_device *dev)
1199 {
1200         int ret;
1201
1202         /*
1203          *      Is it already up?
1204          */
1205         if (dev->flags & IFF_UP)
1206                 return 0;
1207
1208         /*
1209          *      Open device
1210          */
1211         ret = __dev_open(dev);
1212         if (ret < 0)
1213                 return ret;
1214
1215         /*
1216          *      ... and announce new interface.
1217          */
1218         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1219         call_netdevice_notifiers(NETDEV_UP, dev);
1220
1221         return ret;
1222 }
1223 EXPORT_SYMBOL(dev_open);
1224
1225 static int __dev_close_many(struct list_head *head)
1226 {
1227         struct net_device *dev;
1228
1229         ASSERT_RTNL();
1230         might_sleep();
1231
1232         list_for_each_entry(dev, head, unreg_list) {
1233                 /*
1234                  *      Tell people we are going down, so that they can
1235                  *      prepare to death, when device is still operating.
1236                  */
1237                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1238
1239                 clear_bit(__LINK_STATE_START, &dev->state);
1240
1241                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1242                  * can be even on different cpu. So just clear netif_running().
1243                  *
1244                  * dev->stop() will invoke napi_disable() on all of it's
1245                  * napi_struct instances on this device.
1246                  */
1247                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1248         }
1249
1250         dev_deactivate_many(head);
1251
1252         list_for_each_entry(dev, head, unreg_list) {
1253                 const struct net_device_ops *ops = dev->netdev_ops;
1254
1255                 /*
1256                  *      Call the device specific close. This cannot fail.
1257                  *      Only if device is UP
1258                  *
1259                  *      We allow it to be called even after a DETACH hot-plug
1260                  *      event.
1261                  */
1262                 if (ops->ndo_stop)
1263                         ops->ndo_stop(dev);
1264
1265                 /*
1266                  *      Device is now down.
1267                  */
1268
1269                 dev->flags &= ~IFF_UP;
1270
1271                 /*
1272                  *      Shutdown NET_DMA
1273                  */
1274                 net_dmaengine_put();
1275         }
1276
1277         return 0;
1278 }
1279
1280 static int __dev_close(struct net_device *dev)
1281 {
1282         LIST_HEAD(single);
1283
1284         list_add(&dev->unreg_list, &single);
1285         return __dev_close_many(&single);
1286 }
1287
1288 int dev_close_many(struct list_head *head)
1289 {
1290         struct net_device *dev, *tmp;
1291         LIST_HEAD(tmp_list);
1292
1293         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1294                 if (!(dev->flags & IFF_UP))
1295                         list_move(&dev->unreg_list, &tmp_list);
1296
1297         __dev_close_many(head);
1298
1299         /*
1300          * Tell people we are down
1301          */
1302         list_for_each_entry(dev, head, unreg_list) {
1303                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1304                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1305         }
1306
1307         /* rollback_registered_many needs the complete original list */
1308         list_splice(&tmp_list, head);
1309         return 0;
1310 }
1311
1312 /**
1313  *      dev_close - shutdown an interface.
1314  *      @dev: device to shutdown
1315  *
1316  *      This function moves an active device into down state. A
1317  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1318  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1319  *      chain.
1320  */
1321 int dev_close(struct net_device *dev)
1322 {
1323         LIST_HEAD(single);
1324
1325         list_add(&dev->unreg_list, &single);
1326         dev_close_many(&single);
1327
1328         return 0;
1329 }
1330 EXPORT_SYMBOL(dev_close);
1331
1332
1333 /**
1334  *      dev_disable_lro - disable Large Receive Offload on a device
1335  *      @dev: device
1336  *
1337  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1338  *      called under RTNL.  This is needed if received packets may be
1339  *      forwarded to another interface.
1340  */
1341 void dev_disable_lro(struct net_device *dev)
1342 {
1343         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1344             dev->ethtool_ops->set_flags) {
1345                 u32 flags = dev->ethtool_ops->get_flags(dev);
1346                 if (flags & ETH_FLAG_LRO) {
1347                         flags &= ~ETH_FLAG_LRO;
1348                         dev->ethtool_ops->set_flags(dev, flags);
1349                 }
1350         }
1351         WARN_ON(dev->features & NETIF_F_LRO);
1352 }
1353 EXPORT_SYMBOL(dev_disable_lro);
1354
1355
1356 static int dev_boot_phase = 1;
1357
1358 /*
1359  *      Device change register/unregister. These are not inline or static
1360  *      as we export them to the world.
1361  */
1362
1363 /**
1364  *      register_netdevice_notifier - register a network notifier block
1365  *      @nb: notifier
1366  *
1367  *      Register a notifier to be called when network device events occur.
1368  *      The notifier passed is linked into the kernel structures and must
1369  *      not be reused until it has been unregistered. A negative errno code
1370  *      is returned on a failure.
1371  *
1372  *      When registered all registration and up events are replayed
1373  *      to the new notifier to allow device to have a race free
1374  *      view of the network device list.
1375  */
1376
1377 int register_netdevice_notifier(struct notifier_block *nb)
1378 {
1379         struct net_device *dev;
1380         struct net_device *last;
1381         struct net *net;
1382         int err;
1383
1384         rtnl_lock();
1385         err = raw_notifier_chain_register(&netdev_chain, nb);
1386         if (err)
1387                 goto unlock;
1388         if (dev_boot_phase)
1389                 goto unlock;
1390         for_each_net(net) {
1391                 for_each_netdev(net, dev) {
1392                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1393                         err = notifier_to_errno(err);
1394                         if (err)
1395                                 goto rollback;
1396
1397                         if (!(dev->flags & IFF_UP))
1398                                 continue;
1399
1400                         nb->notifier_call(nb, NETDEV_UP, dev);
1401                 }
1402         }
1403
1404 unlock:
1405         rtnl_unlock();
1406         return err;
1407
1408 rollback:
1409         last = dev;
1410         for_each_net(net) {
1411                 for_each_netdev(net, dev) {
1412                         if (dev == last)
1413                                 break;
1414
1415                         if (dev->flags & IFF_UP) {
1416                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1417                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1418                         }
1419                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1420                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1421                 }
1422         }
1423
1424         raw_notifier_chain_unregister(&netdev_chain, nb);
1425         goto unlock;
1426 }
1427 EXPORT_SYMBOL(register_netdevice_notifier);
1428
1429 /**
1430  *      unregister_netdevice_notifier - unregister a network notifier block
1431  *      @nb: notifier
1432  *
1433  *      Unregister a notifier previously registered by
1434  *      register_netdevice_notifier(). The notifier is unlinked into the
1435  *      kernel structures and may then be reused. A negative errno code
1436  *      is returned on a failure.
1437  */
1438
1439 int unregister_netdevice_notifier(struct notifier_block *nb)
1440 {
1441         int err;
1442
1443         rtnl_lock();
1444         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1445         rtnl_unlock();
1446         return err;
1447 }
1448 EXPORT_SYMBOL(unregister_netdevice_notifier);
1449
1450 /**
1451  *      call_netdevice_notifiers - call all network notifier blocks
1452  *      @val: value passed unmodified to notifier function
1453  *      @dev: net_device pointer passed unmodified to notifier function
1454  *
1455  *      Call all network notifier blocks.  Parameters and return value
1456  *      are as for raw_notifier_call_chain().
1457  */
1458
1459 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1460 {
1461         ASSERT_RTNL();
1462         return raw_notifier_call_chain(&netdev_chain, val, dev);
1463 }
1464
1465 /* When > 0 there are consumers of rx skb time stamps */
1466 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1467
1468 void net_enable_timestamp(void)
1469 {
1470         atomic_inc(&netstamp_needed);
1471 }
1472 EXPORT_SYMBOL(net_enable_timestamp);
1473
1474 void net_disable_timestamp(void)
1475 {
1476         atomic_dec(&netstamp_needed);
1477 }
1478 EXPORT_SYMBOL(net_disable_timestamp);
1479
1480 static inline void net_timestamp_set(struct sk_buff *skb)
1481 {
1482         if (atomic_read(&netstamp_needed))
1483                 __net_timestamp(skb);
1484         else
1485                 skb->tstamp.tv64 = 0;
1486 }
1487
1488 static inline void net_timestamp_check(struct sk_buff *skb)
1489 {
1490         if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1491                 __net_timestamp(skb);
1492 }
1493
1494 /**
1495  * dev_forward_skb - loopback an skb to another netif
1496  *
1497  * @dev: destination network device
1498  * @skb: buffer to forward
1499  *
1500  * return values:
1501  *      NET_RX_SUCCESS  (no congestion)
1502  *      NET_RX_DROP     (packet was dropped, but freed)
1503  *
1504  * dev_forward_skb can be used for injecting an skb from the
1505  * start_xmit function of one device into the receive queue
1506  * of another device.
1507  *
1508  * The receiving device may be in another namespace, so
1509  * we have to clear all information in the skb that could
1510  * impact namespace isolation.
1511  */
1512 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1513 {
1514         skb_orphan(skb);
1515         nf_reset(skb);
1516
1517         if (unlikely(!(dev->flags & IFF_UP) ||
1518                      (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1519                 atomic_long_inc(&dev->rx_dropped);
1520                 kfree_skb(skb);
1521                 return NET_RX_DROP;
1522         }
1523         skb_set_dev(skb, dev);
1524         skb->tstamp.tv64 = 0;
1525         skb->pkt_type = PACKET_HOST;
1526         skb->protocol = eth_type_trans(skb, dev);
1527         return netif_rx(skb);
1528 }
1529 EXPORT_SYMBOL_GPL(dev_forward_skb);
1530
1531 static inline int deliver_skb(struct sk_buff *skb,
1532                               struct packet_type *pt_prev,
1533                               struct net_device *orig_dev)
1534 {
1535         atomic_inc(&skb->users);
1536         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1537 }
1538
1539 /*
1540  *      Support routine. Sends outgoing frames to any network
1541  *      taps currently in use.
1542  */
1543
1544 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1545 {
1546         struct packet_type *ptype;
1547         struct sk_buff *skb2 = NULL;
1548         struct packet_type *pt_prev = NULL;
1549
1550         rcu_read_lock();
1551         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1552                 /* Never send packets back to the socket
1553                  * they originated from - MvS (miquels@drinkel.ow.org)
1554                  */
1555                 if ((ptype->dev == dev || !ptype->dev) &&
1556                     (ptype->af_packet_priv == NULL ||
1557                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1558                         if (pt_prev) {
1559                                 deliver_skb(skb2, pt_prev, skb->dev);
1560                                 pt_prev = ptype;
1561                                 continue;
1562                         }
1563
1564                         skb2 = skb_clone(skb, GFP_ATOMIC);
1565                         if (!skb2)
1566                                 break;
1567
1568                         net_timestamp_set(skb2);
1569
1570                         /* skb->nh should be correctly
1571                            set by sender, so that the second statement is
1572                            just protection against buggy protocols.
1573                          */
1574                         skb_reset_mac_header(skb2);
1575
1576                         if (skb_network_header(skb2) < skb2->data ||
1577                             skb2->network_header > skb2->tail) {
1578                                 if (net_ratelimit())
1579                                         printk(KERN_CRIT "protocol %04x is "
1580                                                "buggy, dev %s\n",
1581                                                ntohs(skb2->protocol),
1582                                                dev->name);
1583                                 skb_reset_network_header(skb2);
1584                         }
1585
1586                         skb2->transport_header = skb2->network_header;
1587                         skb2->pkt_type = PACKET_OUTGOING;
1588                         pt_prev = ptype;
1589                 }
1590         }
1591         if (pt_prev)
1592                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1593         rcu_read_unlock();
1594 }
1595
1596 /*
1597  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1598  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1599  */
1600 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1601 {
1602         int rc;
1603
1604         if (txq < 1 || txq > dev->num_tx_queues)
1605                 return -EINVAL;
1606
1607         if (dev->reg_state == NETREG_REGISTERED) {
1608                 ASSERT_RTNL();
1609
1610                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1611                                                   txq);
1612                 if (rc)
1613                         return rc;
1614
1615                 if (txq < dev->real_num_tx_queues)
1616                         qdisc_reset_all_tx_gt(dev, txq);
1617         }
1618
1619         dev->real_num_tx_queues = txq;
1620         return 0;
1621 }
1622 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1623
1624 #ifdef CONFIG_RPS
1625 /**
1626  *      netif_set_real_num_rx_queues - set actual number of RX queues used
1627  *      @dev: Network device
1628  *      @rxq: Actual number of RX queues
1629  *
1630  *      This must be called either with the rtnl_lock held or before
1631  *      registration of the net device.  Returns 0 on success, or a
1632  *      negative error code.  If called before registration, it always
1633  *      succeeds.
1634  */
1635 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1636 {
1637         int rc;
1638
1639         if (rxq < 1 || rxq > dev->num_rx_queues)
1640                 return -EINVAL;
1641
1642         if (dev->reg_state == NETREG_REGISTERED) {
1643                 ASSERT_RTNL();
1644
1645                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1646                                                   rxq);
1647                 if (rc)
1648                         return rc;
1649         }
1650
1651         dev->real_num_rx_queues = rxq;
1652         return 0;
1653 }
1654 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1655 #endif
1656
1657 static inline void __netif_reschedule(struct Qdisc *q)
1658 {
1659         struct softnet_data *sd;
1660         unsigned long flags;
1661
1662         local_irq_save(flags);
1663         sd = &__get_cpu_var(softnet_data);
1664         q->next_sched = NULL;
1665         *sd->output_queue_tailp = q;
1666         sd->output_queue_tailp = &q->next_sched;
1667         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1668         local_irq_restore(flags);
1669 }
1670
1671 void __netif_schedule(struct Qdisc *q)
1672 {
1673         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1674                 __netif_reschedule(q);
1675 }
1676 EXPORT_SYMBOL(__netif_schedule);
1677
1678 void dev_kfree_skb_irq(struct sk_buff *skb)
1679 {
1680         if (atomic_dec_and_test(&skb->users)) {
1681                 struct softnet_data *sd;
1682                 unsigned long flags;
1683
1684                 local_irq_save(flags);
1685                 sd = &__get_cpu_var(softnet_data);
1686                 skb->next = sd->completion_queue;
1687                 sd->completion_queue = skb;
1688                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1689                 local_irq_restore(flags);
1690         }
1691 }
1692 EXPORT_SYMBOL(dev_kfree_skb_irq);
1693
1694 void dev_kfree_skb_any(struct sk_buff *skb)
1695 {
1696         if (in_irq() || irqs_disabled())
1697                 dev_kfree_skb_irq(skb);
1698         else
1699                 dev_kfree_skb(skb);
1700 }
1701 EXPORT_SYMBOL(dev_kfree_skb_any);
1702
1703
1704 /**
1705  * netif_device_detach - mark device as removed
1706  * @dev: network device
1707  *
1708  * Mark device as removed from system and therefore no longer available.
1709  */
1710 void netif_device_detach(struct net_device *dev)
1711 {
1712         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1713             netif_running(dev)) {
1714                 netif_tx_stop_all_queues(dev);
1715         }
1716 }
1717 EXPORT_SYMBOL(netif_device_detach);
1718
1719 /**
1720  * netif_device_attach - mark device as attached
1721  * @dev: network device
1722  *
1723  * Mark device as attached from system and restart if needed.
1724  */
1725 void netif_device_attach(struct net_device *dev)
1726 {
1727         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1728             netif_running(dev)) {
1729                 netif_tx_wake_all_queues(dev);
1730                 __netdev_watchdog_up(dev);
1731         }
1732 }
1733 EXPORT_SYMBOL(netif_device_attach);
1734
1735 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1736 {
1737         return ((features & NETIF_F_GEN_CSUM) ||
1738                 ((features & NETIF_F_V4_CSUM) &&
1739                  protocol == htons(ETH_P_IP)) ||
1740                 ((features & NETIF_F_V6_CSUM) &&
1741                  protocol == htons(ETH_P_IPV6)) ||
1742                 ((features & NETIF_F_FCOE_CRC) &&
1743                  protocol == htons(ETH_P_FCOE)));
1744 }
1745
1746 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1747 {
1748         __be16 protocol = skb->protocol;
1749         int features = dev->features;
1750
1751         if (vlan_tx_tag_present(skb)) {
1752                 features &= dev->vlan_features;
1753         } else if (protocol == htons(ETH_P_8021Q)) {
1754                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1755                 protocol = veh->h_vlan_encapsulated_proto;
1756                 features &= dev->vlan_features;
1757         }
1758
1759         return can_checksum_protocol(features, protocol);
1760 }
1761
1762 /**
1763  * skb_dev_set -- assign a new device to a buffer
1764  * @skb: buffer for the new device
1765  * @dev: network device
1766  *
1767  * If an skb is owned by a device already, we have to reset
1768  * all data private to the namespace a device belongs to
1769  * before assigning it a new device.
1770  */
1771 #ifdef CONFIG_NET_NS
1772 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1773 {
1774         skb_dst_drop(skb);
1775         if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1776                 secpath_reset(skb);
1777                 nf_reset(skb);
1778                 skb_init_secmark(skb);
1779                 skb->mark = 0;
1780                 skb->priority = 0;
1781                 skb->nf_trace = 0;
1782                 skb->ipvs_property = 0;
1783 #ifdef CONFIG_NET_SCHED
1784                 skb->tc_index = 0;
1785 #endif
1786         }
1787         skb->dev = dev;
1788 }
1789 EXPORT_SYMBOL(skb_set_dev);
1790 #endif /* CONFIG_NET_NS */
1791
1792 /*
1793  * Invalidate hardware checksum when packet is to be mangled, and
1794  * complete checksum manually on outgoing path.
1795  */
1796 int skb_checksum_help(struct sk_buff *skb)
1797 {
1798         __wsum csum;
1799         int ret = 0, offset;
1800
1801         if (skb->ip_summed == CHECKSUM_COMPLETE)
1802                 goto out_set_summed;
1803
1804         if (unlikely(skb_shinfo(skb)->gso_size)) {
1805                 /* Let GSO fix up the checksum. */
1806                 goto out_set_summed;
1807         }
1808
1809         offset = skb_checksum_start_offset(skb);
1810         BUG_ON(offset >= skb_headlen(skb));
1811         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1812
1813         offset += skb->csum_offset;
1814         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1815
1816         if (skb_cloned(skb) &&
1817             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1818                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1819                 if (ret)
1820                         goto out;
1821         }
1822
1823         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1824 out_set_summed:
1825         skb->ip_summed = CHECKSUM_NONE;
1826 out:
1827         return ret;
1828 }
1829 EXPORT_SYMBOL(skb_checksum_help);
1830
1831 /**
1832  *      skb_gso_segment - Perform segmentation on skb.
1833  *      @skb: buffer to segment
1834  *      @features: features for the output path (see dev->features)
1835  *
1836  *      This function segments the given skb and returns a list of segments.
1837  *
1838  *      It may return NULL if the skb requires no segmentation.  This is
1839  *      only possible when GSO is used for verifying header integrity.
1840  */
1841 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1842 {
1843         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1844         struct packet_type *ptype;
1845         __be16 type = skb->protocol;
1846         int vlan_depth = ETH_HLEN;
1847         int err;
1848
1849         while (type == htons(ETH_P_8021Q)) {
1850                 struct vlan_hdr *vh;
1851
1852                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1853                         return ERR_PTR(-EINVAL);
1854
1855                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1856                 type = vh->h_vlan_encapsulated_proto;
1857                 vlan_depth += VLAN_HLEN;
1858         }
1859
1860         skb_reset_mac_header(skb);
1861         skb->mac_len = skb->network_header - skb->mac_header;
1862         __skb_pull(skb, skb->mac_len);
1863
1864         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1865                 struct net_device *dev = skb->dev;
1866                 struct ethtool_drvinfo info = {};
1867
1868                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1869                         dev->ethtool_ops->get_drvinfo(dev, &info);
1870
1871                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1872                      info.driver, dev ? dev->features : 0L,
1873                      skb->sk ? skb->sk->sk_route_caps : 0L,
1874                      skb->len, skb->data_len, skb->ip_summed);
1875
1876                 if (skb_header_cloned(skb) &&
1877                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1878                         return ERR_PTR(err);
1879         }
1880
1881         rcu_read_lock();
1882         list_for_each_entry_rcu(ptype,
1883                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1884                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1885                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1886                                 err = ptype->gso_send_check(skb);
1887                                 segs = ERR_PTR(err);
1888                                 if (err || skb_gso_ok(skb, features))
1889                                         break;
1890                                 __skb_push(skb, (skb->data -
1891                                                  skb_network_header(skb)));
1892                         }
1893                         segs = ptype->gso_segment(skb, features);
1894                         break;
1895                 }
1896         }
1897         rcu_read_unlock();
1898
1899         __skb_push(skb, skb->data - skb_mac_header(skb));
1900
1901         return segs;
1902 }
1903 EXPORT_SYMBOL(skb_gso_segment);
1904
1905 /* Take action when hardware reception checksum errors are detected. */
1906 #ifdef CONFIG_BUG
1907 void netdev_rx_csum_fault(struct net_device *dev)
1908 {
1909         if (net_ratelimit()) {
1910                 printk(KERN_ERR "%s: hw csum failure.\n",
1911                         dev ? dev->name : "<unknown>");
1912                 dump_stack();
1913         }
1914 }
1915 EXPORT_SYMBOL(netdev_rx_csum_fault);
1916 #endif
1917
1918 /* Actually, we should eliminate this check as soon as we know, that:
1919  * 1. IOMMU is present and allows to map all the memory.
1920  * 2. No high memory really exists on this machine.
1921  */
1922
1923 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1924 {
1925 #ifdef CONFIG_HIGHMEM
1926         int i;
1927         if (!(dev->features & NETIF_F_HIGHDMA)) {
1928                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1929                         if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1930                                 return 1;
1931         }
1932
1933         if (PCI_DMA_BUS_IS_PHYS) {
1934                 struct device *pdev = dev->dev.parent;
1935
1936                 if (!pdev)
1937                         return 0;
1938                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1939                         dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1940                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1941                                 return 1;
1942                 }
1943         }
1944 #endif
1945         return 0;
1946 }
1947
1948 struct dev_gso_cb {
1949         void (*destructor)(struct sk_buff *skb);
1950 };
1951
1952 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1953
1954 static void dev_gso_skb_destructor(struct sk_buff *skb)
1955 {
1956         struct dev_gso_cb *cb;
1957
1958         do {
1959                 struct sk_buff *nskb = skb->next;
1960
1961                 skb->next = nskb->next;
1962                 nskb->next = NULL;
1963                 kfree_skb(nskb);
1964         } while (skb->next);
1965
1966         cb = DEV_GSO_CB(skb);
1967         if (cb->destructor)
1968                 cb->destructor(skb);
1969 }
1970
1971 /**
1972  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1973  *      @skb: buffer to segment
1974  *
1975  *      This function segments the given skb and stores the list of segments
1976  *      in skb->next.
1977  */
1978 static int dev_gso_segment(struct sk_buff *skb)
1979 {
1980         struct net_device *dev = skb->dev;
1981         struct sk_buff *segs;
1982         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1983                                          NETIF_F_SG : 0);
1984
1985         segs = skb_gso_segment(skb, features);
1986
1987         /* Verifying header integrity only. */
1988         if (!segs)
1989                 return 0;
1990
1991         if (IS_ERR(segs))
1992                 return PTR_ERR(segs);
1993
1994         skb->next = segs;
1995         DEV_GSO_CB(skb)->destructor = skb->destructor;
1996         skb->destructor = dev_gso_skb_destructor;
1997
1998         return 0;
1999 }
2000
2001 /*
2002  * Try to orphan skb early, right before transmission by the device.
2003  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2004  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2005  */
2006 static inline void skb_orphan_try(struct sk_buff *skb)
2007 {
2008         struct sock *sk = skb->sk;
2009
2010         if (sk && !skb_shinfo(skb)->tx_flags) {
2011                 /* skb_tx_hash() wont be able to get sk.
2012                  * We copy sk_hash into skb->rxhash
2013                  */
2014                 if (!skb->rxhash)
2015                         skb->rxhash = sk->sk_hash;
2016                 skb_orphan(skb);
2017         }
2018 }
2019
2020 static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features)
2021 {
2022         if (!can_checksum_protocol(protocol, features)) {
2023                 features &= ~NETIF_F_ALL_CSUM;
2024                 features &= ~NETIF_F_SG;
2025         } else if (illegal_highdma(skb->dev, skb)) {
2026                 features &= ~NETIF_F_SG;
2027         }
2028
2029         return features;
2030 }
2031
2032 int netif_skb_features(struct sk_buff *skb)
2033 {
2034         __be16 protocol = skb->protocol;
2035         int features = skb->dev->features;
2036
2037         if (protocol == htons(ETH_P_8021Q)) {
2038                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2039                 protocol = veh->h_vlan_encapsulated_proto;
2040         } else if (!vlan_tx_tag_present(skb)) {
2041                 return harmonize_features(skb, protocol, features);
2042         }
2043
2044         features &= skb->dev->vlan_features;
2045
2046         if (protocol != htons(ETH_P_8021Q)) {
2047                 return harmonize_features(skb, protocol, features);
2048         } else {
2049                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2050                                 NETIF_F_GEN_CSUM;
2051                 return harmonize_features(skb, protocol, features);
2052         }
2053 }
2054 EXPORT_SYMBOL(netif_skb_features);
2055
2056 /*
2057  * Returns true if either:
2058  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2059  *      2. skb is fragmented and the device does not support SG, or if
2060  *         at least one of fragments is in highmem and device does not
2061  *         support DMA from it.
2062  */
2063 static inline int skb_needs_linearize(struct sk_buff *skb,
2064                                       struct net_device *dev)
2065 {
2066         if (skb_is_nonlinear(skb)) {
2067                 int features = dev->features;
2068
2069                 if (vlan_tx_tag_present(skb))
2070                         features &= dev->vlan_features;
2071
2072                 return (skb_has_frag_list(skb) &&
2073                         !(features & NETIF_F_FRAGLIST)) ||
2074                         (skb_shinfo(skb)->nr_frags &&
2075                         (!(features & NETIF_F_SG) ||
2076                         illegal_highdma(dev, skb)));
2077         }
2078
2079         return 0;
2080 }
2081
2082 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2083                         struct netdev_queue *txq)
2084 {
2085         const struct net_device_ops *ops = dev->netdev_ops;
2086         int rc = NETDEV_TX_OK;
2087
2088         if (likely(!skb->next)) {
2089                 /*
2090                  * If device doesnt need skb->dst, release it right now while
2091                  * its hot in this cpu cache
2092                  */
2093                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2094                         skb_dst_drop(skb);
2095
2096                 if (!list_empty(&ptype_all))
2097                         dev_queue_xmit_nit(skb, dev);
2098
2099                 skb_orphan_try(skb);
2100
2101                 if (vlan_tx_tag_present(skb) &&
2102                     !(dev->features & NETIF_F_HW_VLAN_TX)) {
2103                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2104                         if (unlikely(!skb))
2105                                 goto out;
2106
2107                         skb->vlan_tci = 0;
2108                 }
2109
2110                 if (netif_needs_gso(dev, skb)) {
2111                         if (unlikely(dev_gso_segment(skb)))
2112                                 goto out_kfree_skb;
2113                         if (skb->next)
2114                                 goto gso;
2115                 } else {
2116                         if (skb_needs_linearize(skb, dev) &&
2117                             __skb_linearize(skb))
2118                                 goto out_kfree_skb;
2119
2120                         /* If packet is not checksummed and device does not
2121                          * support checksumming for this protocol, complete
2122                          * checksumming here.
2123                          */
2124                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2125                                 skb_set_transport_header(skb,
2126                                         skb_checksum_start_offset(skb));
2127                                 if (!dev_can_checksum(dev, skb) &&
2128                                      skb_checksum_help(skb))
2129                                         goto out_kfree_skb;
2130                         }
2131                 }
2132
2133                 rc = ops->ndo_start_xmit(skb, dev);
2134                 trace_net_dev_xmit(skb, rc);
2135                 if (rc == NETDEV_TX_OK)
2136                         txq_trans_update(txq);
2137                 return rc;
2138         }
2139
2140 gso:
2141         do {
2142                 struct sk_buff *nskb = skb->next;
2143
2144                 skb->next = nskb->next;
2145                 nskb->next = NULL;
2146
2147                 /*
2148                  * If device doesnt need nskb->dst, release it right now while
2149                  * its hot in this cpu cache
2150                  */
2151                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2152                         skb_dst_drop(nskb);
2153
2154                 rc = ops->ndo_start_xmit(nskb, dev);
2155                 trace_net_dev_xmit(nskb, rc);
2156                 if (unlikely(rc != NETDEV_TX_OK)) {
2157                         if (rc & ~NETDEV_TX_MASK)
2158                                 goto out_kfree_gso_skb;
2159                         nskb->next = skb->next;
2160                         skb->next = nskb;
2161                         return rc;
2162                 }
2163                 txq_trans_update(txq);
2164                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2165                         return NETDEV_TX_BUSY;
2166         } while (skb->next);
2167
2168 out_kfree_gso_skb:
2169         if (likely(skb->next == NULL))
2170                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2171 out_kfree_skb:
2172         kfree_skb(skb);
2173 out:
2174         return rc;
2175 }
2176
2177 static u32 hashrnd __read_mostly;
2178
2179 /*
2180  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2181  * to be used as a distribution range.
2182  */
2183 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2184                   unsigned int num_tx_queues)
2185 {
2186         u32 hash;
2187
2188         if (skb_rx_queue_recorded(skb)) {
2189                 hash = skb_get_rx_queue(skb);
2190                 while (unlikely(hash >= num_tx_queues))
2191                         hash -= num_tx_queues;
2192                 return hash;
2193         }
2194
2195         if (skb->sk && skb->sk->sk_hash)
2196                 hash = skb->sk->sk_hash;
2197         else
2198                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2199         hash = jhash_1word(hash, hashrnd);
2200
2201         return (u16) (((u64) hash * num_tx_queues) >> 32);
2202 }
2203 EXPORT_SYMBOL(__skb_tx_hash);
2204
2205 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2206 {
2207         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2208                 if (net_ratelimit()) {
2209                         pr_warning("%s selects TX queue %d, but "
2210                                 "real number of TX queues is %d\n",
2211                                 dev->name, queue_index, dev->real_num_tx_queues);
2212                 }
2213                 return 0;
2214         }
2215         return queue_index;
2216 }
2217
2218 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2219 {
2220 #ifdef CONFIG_XPS
2221         struct xps_dev_maps *dev_maps;
2222         struct xps_map *map;
2223         int queue_index = -1;
2224
2225         rcu_read_lock();
2226         dev_maps = rcu_dereference(dev->xps_maps);
2227         if (dev_maps) {
2228                 map = rcu_dereference(
2229                     dev_maps->cpu_map[raw_smp_processor_id()]);
2230                 if (map) {
2231                         if (map->len == 1)
2232                                 queue_index = map->queues[0];
2233                         else {
2234                                 u32 hash;
2235                                 if (skb->sk && skb->sk->sk_hash)
2236                                         hash = skb->sk->sk_hash;
2237                                 else
2238                                         hash = (__force u16) skb->protocol ^
2239                                             skb->rxhash;
2240                                 hash = jhash_1word(hash, hashrnd);
2241                                 queue_index = map->queues[
2242                                     ((u64)hash * map->len) >> 32];
2243                         }
2244                         if (unlikely(queue_index >= dev->real_num_tx_queues))
2245                                 queue_index = -1;
2246                 }
2247         }
2248         rcu_read_unlock();
2249
2250         return queue_index;
2251 #else
2252         return -1;
2253 #endif
2254 }
2255
2256 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2257                                         struct sk_buff *skb)
2258 {
2259         int queue_index;
2260         const struct net_device_ops *ops = dev->netdev_ops;
2261
2262         if (dev->real_num_tx_queues == 1)
2263                 queue_index = 0;
2264         else if (ops->ndo_select_queue) {
2265                 queue_index = ops->ndo_select_queue(dev, skb);
2266                 queue_index = dev_cap_txqueue(dev, queue_index);
2267         } else {
2268                 struct sock *sk = skb->sk;
2269                 queue_index = sk_tx_queue_get(sk);
2270
2271                 if (queue_index < 0 || skb->ooo_okay ||
2272                     queue_index >= dev->real_num_tx_queues) {
2273                         int old_index = queue_index;
2274
2275                         queue_index = get_xps_queue(dev, skb);
2276                         if (queue_index < 0)
2277                                 queue_index = skb_tx_hash(dev, skb);
2278
2279                         if (queue_index != old_index && sk) {
2280                                 struct dst_entry *dst =
2281                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2282
2283                                 if (dst && skb_dst(skb) == dst)
2284                                         sk_tx_queue_set(sk, queue_index);
2285                         }
2286                 }
2287         }
2288
2289         skb_set_queue_mapping(skb, queue_index);
2290         return netdev_get_tx_queue(dev, queue_index);
2291 }
2292
2293 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2294                                  struct net_device *dev,
2295                                  struct netdev_queue *txq)
2296 {
2297         spinlock_t *root_lock = qdisc_lock(q);
2298         bool contended = qdisc_is_running(q);
2299         int rc;
2300
2301         /*
2302          * Heuristic to force contended enqueues to serialize on a
2303          * separate lock before trying to get qdisc main lock.
2304          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2305          * and dequeue packets faster.
2306          */
2307         if (unlikely(contended))
2308                 spin_lock(&q->busylock);
2309
2310         spin_lock(root_lock);
2311         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2312                 kfree_skb(skb);
2313                 rc = NET_XMIT_DROP;
2314         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2315                    qdisc_run_begin(q)) {
2316                 /*
2317                  * This is a work-conserving queue; there are no old skbs
2318                  * waiting to be sent out; and the qdisc is not running -
2319                  * xmit the skb directly.
2320                  */
2321                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2322                         skb_dst_force(skb);
2323                 __qdisc_update_bstats(q, skb->len);
2324                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2325                         if (unlikely(contended)) {
2326                                 spin_unlock(&q->busylock);
2327                                 contended = false;
2328                         }
2329                         __qdisc_run(q);
2330                 } else
2331                         qdisc_run_end(q);
2332
2333                 rc = NET_XMIT_SUCCESS;
2334         } else {
2335                 skb_dst_force(skb);
2336                 rc = qdisc_enqueue_root(skb, q);
2337                 if (qdisc_run_begin(q)) {
2338                         if (unlikely(contended)) {
2339                                 spin_unlock(&q->busylock);
2340                                 contended = false;
2341                         }
2342                         __qdisc_run(q);
2343                 }
2344         }
2345         spin_unlock(root_lock);
2346         if (unlikely(contended))
2347                 spin_unlock(&q->busylock);
2348         return rc;
2349 }
2350
2351 static DEFINE_PER_CPU(int, xmit_recursion);
2352 #define RECURSION_LIMIT 10
2353
2354 /**
2355  *      dev_queue_xmit - transmit a buffer
2356  *      @skb: buffer to transmit
2357  *
2358  *      Queue a buffer for transmission to a network device. The caller must
2359  *      have set the device and priority and built the buffer before calling
2360  *      this function. The function can be called from an interrupt.
2361  *
2362  *      A negative errno code is returned on a failure. A success does not
2363  *      guarantee the frame will be transmitted as it may be dropped due
2364  *      to congestion or traffic shaping.
2365  *
2366  * -----------------------------------------------------------------------------------
2367  *      I notice this method can also return errors from the queue disciplines,
2368  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2369  *      be positive.
2370  *
2371  *      Regardless of the return value, the skb is consumed, so it is currently
2372  *      difficult to retry a send to this method.  (You can bump the ref count
2373  *      before sending to hold a reference for retry if you are careful.)
2374  *
2375  *      When calling this method, interrupts MUST be enabled.  This is because
2376  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2377  *          --BLG
2378  */
2379 int dev_queue_xmit(struct sk_buff *skb)
2380 {
2381         struct net_device *dev = skb->dev;
2382         struct netdev_queue *txq;
2383         struct Qdisc *q;
2384         int rc = -ENOMEM;
2385
2386         /* Disable soft irqs for various locks below. Also
2387          * stops preemption for RCU.
2388          */
2389         rcu_read_lock_bh();
2390
2391         txq = dev_pick_tx(dev, skb);
2392         q = rcu_dereference_bh(txq->qdisc);
2393
2394 #ifdef CONFIG_NET_CLS_ACT
2395         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2396 #endif
2397         trace_net_dev_queue(skb);
2398         if (q->enqueue) {
2399                 rc = __dev_xmit_skb(skb, q, dev, txq);
2400                 goto out;
2401         }
2402
2403         /* The device has no queue. Common case for software devices:
2404            loopback, all the sorts of tunnels...
2405
2406            Really, it is unlikely that netif_tx_lock protection is necessary
2407            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2408            counters.)
2409            However, it is possible, that they rely on protection
2410            made by us here.
2411
2412            Check this and shot the lock. It is not prone from deadlocks.
2413            Either shot noqueue qdisc, it is even simpler 8)
2414          */
2415         if (dev->flags & IFF_UP) {
2416                 int cpu = smp_processor_id(); /* ok because BHs are off */
2417
2418                 if (txq->xmit_lock_owner != cpu) {
2419
2420                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2421                                 goto recursion_alert;
2422
2423                         HARD_TX_LOCK(dev, txq, cpu);
2424
2425                         if (!netif_tx_queue_stopped(txq)) {
2426                                 __this_cpu_inc(xmit_recursion);
2427                                 rc = dev_hard_start_xmit(skb, dev, txq);
2428                                 __this_cpu_dec(xmit_recursion);
2429                                 if (dev_xmit_complete(rc)) {
2430                                         HARD_TX_UNLOCK(dev, txq);
2431                                         goto out;
2432                                 }
2433                         }
2434                         HARD_TX_UNLOCK(dev, txq);
2435                         if (net_ratelimit())
2436                                 printk(KERN_CRIT "Virtual device %s asks to "
2437                                        "queue packet!\n", dev->name);
2438                 } else {
2439                         /* Recursion is detected! It is possible,
2440                          * unfortunately
2441                          */
2442 recursion_alert:
2443                         if (net_ratelimit())
2444                                 printk(KERN_CRIT "Dead loop on virtual device "
2445                                        "%s, fix it urgently!\n", dev->name);
2446                 }
2447         }
2448
2449         rc = -ENETDOWN;
2450         rcu_read_unlock_bh();
2451
2452         kfree_skb(skb);
2453         return rc;
2454 out:
2455         rcu_read_unlock_bh();
2456         return rc;
2457 }
2458 EXPORT_SYMBOL(dev_queue_xmit);
2459
2460
2461 /*=======================================================================
2462                         Receiver routines
2463   =======================================================================*/
2464
2465 int netdev_max_backlog __read_mostly = 1000;
2466 int netdev_tstamp_prequeue __read_mostly = 1;
2467 int netdev_budget __read_mostly = 300;
2468 int weight_p __read_mostly = 64;            /* old backlog weight */
2469
2470 /* Called with irq disabled */
2471 static inline void ____napi_schedule(struct softnet_data *sd,
2472                                      struct napi_struct *napi)
2473 {
2474         list_add_tail(&napi->poll_list, &sd->poll_list);
2475         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2476 }
2477
2478 /*
2479  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2480  * and src/dst port numbers. Returns a non-zero hash number on success
2481  * and 0 on failure.
2482  */
2483 __u32 __skb_get_rxhash(struct sk_buff *skb)
2484 {
2485         int nhoff, hash = 0, poff;
2486         struct ipv6hdr *ip6;
2487         struct iphdr *ip;
2488         u8 ip_proto;
2489         u32 addr1, addr2, ihl;
2490         union {
2491                 u32 v32;
2492                 u16 v16[2];
2493         } ports;
2494
2495         nhoff = skb_network_offset(skb);
2496
2497         switch (skb->protocol) {
2498         case __constant_htons(ETH_P_IP):
2499                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2500                         goto done;
2501
2502                 ip = (struct iphdr *) (skb->data + nhoff);
2503                 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2504                         ip_proto = 0;
2505                 else
2506                         ip_proto = ip->protocol;
2507                 addr1 = (__force u32) ip->saddr;
2508                 addr2 = (__force u32) ip->daddr;
2509                 ihl = ip->ihl;
2510                 break;
2511         case __constant_htons(ETH_P_IPV6):
2512                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2513                         goto done;
2514
2515                 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2516                 ip_proto = ip6->nexthdr;
2517                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2518                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2519                 ihl = (40 >> 2);
2520                 break;
2521         default:
2522                 goto done;
2523         }
2524
2525         ports.v32 = 0;
2526         poff = proto_ports_offset(ip_proto);
2527         if (poff >= 0) {
2528                 nhoff += ihl * 4 + poff;
2529                 if (pskb_may_pull(skb, nhoff + 4)) {
2530                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2531                         if (ports.v16[1] < ports.v16[0])
2532                                 swap(ports.v16[0], ports.v16[1]);
2533                 }
2534         }
2535
2536         /* get a consistent hash (same value on both flow directions) */
2537         if (addr2 < addr1)
2538                 swap(addr1, addr2);
2539
2540         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2541         if (!hash)
2542                 hash = 1;
2543
2544 done:
2545         return hash;
2546 }
2547 EXPORT_SYMBOL(__skb_get_rxhash);
2548
2549 #ifdef CONFIG_RPS
2550
2551 /* One global table that all flow-based protocols share. */
2552 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2553 EXPORT_SYMBOL(rps_sock_flow_table);
2554
2555 /*
2556  * get_rps_cpu is called from netif_receive_skb and returns the target
2557  * CPU from the RPS map of the receiving queue for a given skb.
2558  * rcu_read_lock must be held on entry.
2559  */
2560 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2561                        struct rps_dev_flow **rflowp)
2562 {
2563         struct netdev_rx_queue *rxqueue;
2564         struct rps_map *map;
2565         struct rps_dev_flow_table *flow_table;
2566         struct rps_sock_flow_table *sock_flow_table;
2567         int cpu = -1;
2568         u16 tcpu;
2569
2570         if (skb_rx_queue_recorded(skb)) {
2571                 u16 index = skb_get_rx_queue(skb);
2572                 if (unlikely(index >= dev->real_num_rx_queues)) {
2573                         WARN_ONCE(dev->real_num_rx_queues > 1,
2574                                   "%s received packet on queue %u, but number "
2575                                   "of RX queues is %u\n",
2576                                   dev->name, index, dev->real_num_rx_queues);
2577                         goto done;
2578                 }
2579                 rxqueue = dev->_rx + index;
2580         } else
2581                 rxqueue = dev->_rx;
2582
2583         map = rcu_dereference(rxqueue->rps_map);
2584         if (map) {
2585                 if (map->len == 1) {
2586                         tcpu = map->cpus[0];
2587                         if (cpu_online(tcpu))
2588                                 cpu = tcpu;
2589                         goto done;
2590                 }
2591         } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2592                 goto done;
2593         }
2594
2595         skb_reset_network_header(skb);
2596         if (!skb_get_rxhash(skb))
2597                 goto done;
2598
2599         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2600         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2601         if (flow_table && sock_flow_table) {
2602                 u16 next_cpu;
2603                 struct rps_dev_flow *rflow;
2604
2605                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2606                 tcpu = rflow->cpu;
2607
2608                 next_cpu = sock_flow_table->ents[skb->rxhash &
2609                     sock_flow_table->mask];
2610
2611                 /*
2612                  * If the desired CPU (where last recvmsg was done) is
2613                  * different from current CPU (one in the rx-queue flow
2614                  * table entry), switch if one of the following holds:
2615                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2616                  *   - Current CPU is offline.
2617                  *   - The current CPU's queue tail has advanced beyond the
2618                  *     last packet that was enqueued using this table entry.
2619                  *     This guarantees that all previous packets for the flow
2620                  *     have been dequeued, thus preserving in order delivery.
2621                  */
2622                 if (unlikely(tcpu != next_cpu) &&
2623                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2624                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2625                       rflow->last_qtail)) >= 0)) {
2626                         tcpu = rflow->cpu = next_cpu;
2627                         if (tcpu != RPS_NO_CPU)
2628                                 rflow->last_qtail = per_cpu(softnet_data,
2629                                     tcpu).input_queue_head;
2630                 }
2631                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2632                         *rflowp = rflow;
2633                         cpu = tcpu;
2634                         goto done;
2635                 }
2636         }
2637
2638         if (map) {
2639                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2640
2641                 if (cpu_online(tcpu)) {
2642                         cpu = tcpu;
2643                         goto done;
2644                 }
2645         }
2646
2647 done:
2648         return cpu;
2649 }
2650
2651 /* Called from hardirq (IPI) context */
2652 static void rps_trigger_softirq(void *data)
2653 {
2654         struct softnet_data *sd = data;
2655
2656         ____napi_schedule(sd, &sd->backlog);
2657         sd->received_rps++;
2658 }
2659
2660 #endif /* CONFIG_RPS */
2661
2662 /*
2663  * Check if this softnet_data structure is another cpu one
2664  * If yes, queue it to our IPI list and return 1
2665  * If no, return 0
2666  */
2667 static int rps_ipi_queued(struct softnet_data *sd)
2668 {
2669 #ifdef CONFIG_RPS
2670         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2671
2672         if (sd != mysd) {
2673                 sd->rps_ipi_next = mysd->rps_ipi_list;
2674                 mysd->rps_ipi_list = sd;
2675
2676                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2677                 return 1;
2678         }
2679 #endif /* CONFIG_RPS */
2680         return 0;
2681 }
2682
2683 /*
2684  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2685  * queue (may be a remote CPU queue).
2686  */
2687 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2688                               unsigned int *qtail)
2689 {
2690         struct softnet_data *sd;
2691         unsigned long flags;
2692
2693         sd = &per_cpu(softnet_data, cpu);
2694
2695         local_irq_save(flags);
2696
2697         rps_lock(sd);
2698         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2699                 if (skb_queue_len(&sd->input_pkt_queue)) {
2700 enqueue:
2701                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2702                         input_queue_tail_incr_save(sd, qtail);
2703                         rps_unlock(sd);
2704                         local_irq_restore(flags);
2705                         return NET_RX_SUCCESS;
2706                 }
2707
2708                 /* Schedule NAPI for backlog device
2709                  * We can use non atomic operation since we own the queue lock
2710                  */
2711                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2712                         if (!rps_ipi_queued(sd))
2713                                 ____napi_schedule(sd, &sd->backlog);
2714                 }
2715                 goto enqueue;
2716         }
2717
2718         sd->dropped++;
2719         rps_unlock(sd);
2720
2721         local_irq_restore(flags);
2722
2723         atomic_long_inc(&skb->dev->rx_dropped);
2724         kfree_skb(skb);
2725         return NET_RX_DROP;
2726 }
2727
2728 /**
2729  *      netif_rx        -       post buffer to the network code
2730  *      @skb: buffer to post
2731  *
2732  *      This function receives a packet from a device driver and queues it for
2733  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2734  *      may be dropped during processing for congestion control or by the
2735  *      protocol layers.
2736  *
2737  *      return values:
2738  *      NET_RX_SUCCESS  (no congestion)
2739  *      NET_RX_DROP     (packet was dropped)
2740  *
2741  */
2742
2743 int netif_rx(struct sk_buff *skb)
2744 {
2745         int ret;
2746
2747         /* if netpoll wants it, pretend we never saw it */
2748         if (netpoll_rx(skb))
2749                 return NET_RX_DROP;
2750
2751         if (netdev_tstamp_prequeue)
2752                 net_timestamp_check(skb);
2753
2754         trace_netif_rx(skb);
2755 #ifdef CONFIG_RPS
2756         {
2757                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2758                 int cpu;
2759
2760                 preempt_disable();
2761                 rcu_read_lock();
2762
2763                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2764                 if (cpu < 0)
2765                         cpu = smp_processor_id();
2766
2767                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2768
2769                 rcu_read_unlock();
2770                 preempt_enable();
2771         }
2772 #else
2773         {
2774                 unsigned int qtail;
2775                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2776                 put_cpu();
2777         }
2778 #endif
2779         return ret;
2780 }
2781 EXPORT_SYMBOL(netif_rx);
2782
2783 int netif_rx_ni(struct sk_buff *skb)
2784 {
2785         int err;
2786
2787         preempt_disable();
2788         err = netif_rx(skb);
2789         if (local_softirq_pending())
2790                 do_softirq();
2791         preempt_enable();
2792
2793         return err;
2794 }
2795 EXPORT_SYMBOL(netif_rx_ni);
2796
2797 static void net_tx_action(struct softirq_action *h)
2798 {
2799         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2800
2801         if (sd->completion_queue) {
2802                 struct sk_buff *clist;
2803
2804                 local_irq_disable();
2805                 clist = sd->completion_queue;
2806                 sd->completion_queue = NULL;
2807                 local_irq_enable();
2808
2809                 while (clist) {
2810                         struct sk_buff *skb = clist;
2811                         clist = clist->next;
2812
2813                         WARN_ON(atomic_read(&skb->users));
2814                         trace_kfree_skb(skb, net_tx_action);
2815                         __kfree_skb(skb);
2816                 }
2817         }
2818
2819         if (sd->output_queue) {
2820                 struct Qdisc *head;
2821
2822                 local_irq_disable();
2823                 head = sd->output_queue;
2824                 sd->output_queue = NULL;
2825                 sd->output_queue_tailp = &sd->output_queue;
2826                 local_irq_enable();
2827
2828                 while (head) {
2829                         struct Qdisc *q = head;
2830                         spinlock_t *root_lock;
2831
2832                         head = head->next_sched;
2833
2834                         root_lock = qdisc_lock(q);
2835                         if (spin_trylock(root_lock)) {
2836                                 smp_mb__before_clear_bit();
2837                                 clear_bit(__QDISC_STATE_SCHED,
2838                                           &q->state);
2839                                 qdisc_run(q);
2840                                 spin_unlock(root_lock);
2841                         } else {
2842                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2843                                               &q->state)) {
2844                                         __netif_reschedule(q);
2845                                 } else {
2846                                         smp_mb__before_clear_bit();
2847                                         clear_bit(__QDISC_STATE_SCHED,
2848                                                   &q->state);
2849                                 }
2850                         }
2851                 }
2852         }
2853 }
2854
2855 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2856     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2857 /* This hook is defined here for ATM LANE */
2858 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2859                              unsigned char *addr) __read_mostly;
2860 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2861 #endif
2862
2863 #ifdef CONFIG_NET_CLS_ACT
2864 /* TODO: Maybe we should just force sch_ingress to be compiled in
2865  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2866  * a compare and 2 stores extra right now if we dont have it on
2867  * but have CONFIG_NET_CLS_ACT
2868  * NOTE: This doesnt stop any functionality; if you dont have
2869  * the ingress scheduler, you just cant add policies on ingress.
2870  *
2871  */
2872 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2873 {
2874         struct net_device *dev = skb->dev;
2875         u32 ttl = G_TC_RTTL(skb->tc_verd);
2876         int result = TC_ACT_OK;
2877         struct Qdisc *q;
2878
2879         if (unlikely(MAX_RED_LOOP < ttl++)) {
2880                 if (net_ratelimit())
2881                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2882                                skb->skb_iif, dev->ifindex);
2883                 return TC_ACT_SHOT;
2884         }
2885
2886         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2887         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2888
2889         q = rxq->qdisc;
2890         if (q != &noop_qdisc) {
2891                 spin_lock(qdisc_lock(q));
2892                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2893                         result = qdisc_enqueue_root(skb, q);
2894                 spin_unlock(qdisc_lock(q));
2895         }
2896
2897         return result;
2898 }
2899
2900 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2901                                          struct packet_type **pt_prev,
2902                                          int *ret, struct net_device *orig_dev)
2903 {
2904         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2905
2906         if (!rxq || rxq->qdisc == &noop_qdisc)
2907                 goto out;
2908
2909         if (*pt_prev) {
2910                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2911                 *pt_prev = NULL;
2912         }
2913
2914         switch (ing_filter(skb, rxq)) {
2915         case TC_ACT_SHOT:
2916         case TC_ACT_STOLEN:
2917                 kfree_skb(skb);
2918                 return NULL;
2919         }
2920
2921 out:
2922         skb->tc_verd = 0;
2923         return skb;
2924 }
2925 #endif
2926
2927 /**
2928  *      netdev_rx_handler_register - register receive handler
2929  *      @dev: device to register a handler for
2930  *      @rx_handler: receive handler to register
2931  *      @rx_handler_data: data pointer that is used by rx handler
2932  *
2933  *      Register a receive hander for a device. This handler will then be
2934  *      called from __netif_receive_skb. A negative errno code is returned
2935  *      on a failure.
2936  *
2937  *      The caller must hold the rtnl_mutex.
2938  */
2939 int netdev_rx_handler_register(struct net_device *dev,
2940                                rx_handler_func_t *rx_handler,
2941                                void *rx_handler_data)
2942 {
2943         ASSERT_RTNL();
2944
2945         if (dev->rx_handler)
2946                 return -EBUSY;
2947
2948         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
2949         rcu_assign_pointer(dev->rx_handler, rx_handler);
2950
2951         return 0;
2952 }
2953 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
2954
2955 /**
2956  *      netdev_rx_handler_unregister - unregister receive handler
2957  *      @dev: device to unregister a handler from
2958  *
2959  *      Unregister a receive hander from a device.
2960  *
2961  *      The caller must hold the rtnl_mutex.
2962  */
2963 void netdev_rx_handler_unregister(struct net_device *dev)
2964 {
2965
2966         ASSERT_RTNL();
2967         rcu_assign_pointer(dev->rx_handler, NULL);
2968         rcu_assign_pointer(dev->rx_handler_data, NULL);
2969 }
2970 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2971
2972 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2973                                               struct net_device *master)
2974 {
2975         if (skb->pkt_type == PACKET_HOST) {
2976                 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2977
2978                 memcpy(dest, master->dev_addr, ETH_ALEN);
2979         }
2980 }
2981
2982 /* On bonding slaves other than the currently active slave, suppress
2983  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2984  * ARP on active-backup slaves with arp_validate enabled.
2985  */
2986 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2987 {
2988         struct net_device *dev = skb->dev;
2989
2990         if (master->priv_flags & IFF_MASTER_ARPMON)
2991                 dev->last_rx = jiffies;
2992
2993         if ((master->priv_flags & IFF_MASTER_ALB) &&
2994             (master->priv_flags & IFF_BRIDGE_PORT)) {
2995                 /* Do address unmangle. The local destination address
2996                  * will be always the one master has. Provides the right
2997                  * functionality in a bridge.
2998                  */
2999                 skb_bond_set_mac_by_master(skb, master);
3000         }
3001
3002         if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
3003                 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
3004                     skb->protocol == __cpu_to_be16(ETH_P_ARP))
3005                         return 0;
3006
3007                 if (master->priv_flags & IFF_MASTER_ALB) {
3008                         if (skb->pkt_type != PACKET_BROADCAST &&
3009                             skb->pkt_type != PACKET_MULTICAST)
3010                                 return 0;
3011                 }
3012                 if (master->priv_flags & IFF_MASTER_8023AD &&
3013                     skb->protocol == __cpu_to_be16(ETH_P_SLOW))
3014                         return 0;
3015
3016                 return 1;
3017         }
3018         return 0;
3019 }
3020 EXPORT_SYMBOL(__skb_bond_should_drop);
3021
3022 static int __netif_receive_skb(struct sk_buff *skb)
3023 {
3024         struct packet_type *ptype, *pt_prev;
3025         rx_handler_func_t *rx_handler;
3026         struct net_device *orig_dev;
3027         struct net_device *master;
3028         struct net_device *null_or_orig;
3029         struct net_device *orig_or_bond;
3030         int ret = NET_RX_DROP;
3031         __be16 type;
3032
3033         if (!netdev_tstamp_prequeue)
3034                 net_timestamp_check(skb);
3035
3036         trace_netif_receive_skb(skb);
3037
3038         /* if we've gotten here through NAPI, check netpoll */
3039         if (netpoll_receive_skb(skb))
3040                 return NET_RX_DROP;
3041
3042         if (!skb->skb_iif)
3043                 skb->skb_iif = skb->dev->ifindex;
3044
3045         /*
3046          * bonding note: skbs received on inactive slaves should only
3047          * be delivered to pkt handlers that are exact matches.  Also
3048          * the deliver_no_wcard flag will be set.  If packet handlers
3049          * are sensitive to duplicate packets these skbs will need to
3050          * be dropped at the handler.
3051          */
3052         null_or_orig = NULL;
3053         orig_dev = skb->dev;
3054         master = ACCESS_ONCE(orig_dev->master);
3055         if (skb->deliver_no_wcard)
3056                 null_or_orig = orig_dev;
3057         else if (master) {
3058                 if (skb_bond_should_drop(skb, master)) {
3059                         skb->deliver_no_wcard = 1;
3060                         null_or_orig = orig_dev; /* deliver only exact match */
3061                 } else
3062                         skb->dev = master;
3063         }
3064
3065         __this_cpu_inc(softnet_data.processed);
3066         skb_reset_network_header(skb);
3067         skb_reset_transport_header(skb);
3068         skb->mac_len = skb->network_header - skb->mac_header;
3069
3070         pt_prev = NULL;
3071
3072         rcu_read_lock();
3073
3074 #ifdef CONFIG_NET_CLS_ACT
3075         if (skb->tc_verd & TC_NCLS) {
3076                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3077                 goto ncls;
3078         }
3079 #endif
3080
3081         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3082                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
3083                     ptype->dev == orig_dev) {
3084                         if (pt_prev)
3085                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3086                         pt_prev = ptype;
3087                 }
3088         }
3089
3090 #ifdef CONFIG_NET_CLS_ACT
3091         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3092         if (!skb)
3093                 goto out;
3094 ncls:
3095 #endif
3096
3097         /* Handle special case of bridge or macvlan */
3098         rx_handler = rcu_dereference(skb->dev->rx_handler);
3099         if (rx_handler) {
3100                 if (pt_prev) {
3101                         ret = deliver_skb(skb, pt_prev, orig_dev);
3102                         pt_prev = NULL;
3103                 }
3104                 skb = rx_handler(skb);
3105                 if (!skb)
3106                         goto out;
3107         }
3108
3109         if (vlan_tx_tag_present(skb)) {
3110                 if (pt_prev) {
3111                         ret = deliver_skb(skb, pt_prev, orig_dev);
3112                         pt_prev = NULL;
3113                 }
3114                 if (vlan_hwaccel_do_receive(&skb)) {
3115                         ret = __netif_receive_skb(skb);
3116                         goto out;
3117                 } else if (unlikely(!skb))
3118                         goto out;
3119         }
3120
3121         /*
3122          * Make sure frames received on VLAN interfaces stacked on
3123          * bonding interfaces still make their way to any base bonding
3124          * device that may have registered for a specific ptype.  The
3125          * handler may have to adjust skb->dev and orig_dev.
3126          */
3127         orig_or_bond = orig_dev;
3128         if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
3129             (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
3130                 orig_or_bond = vlan_dev_real_dev(skb->dev);
3131         }
3132
3133         type = skb->protocol;
3134         list_for_each_entry_rcu(ptype,
3135                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3136                 if (ptype->type == type && (ptype->dev == null_or_orig ||
3137                      ptype->dev == skb->dev || ptype->dev == orig_dev ||
3138                      ptype->dev == orig_or_bond)) {
3139                         if (pt_prev)
3140                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3141                         pt_prev = ptype;
3142                 }
3143         }
3144
3145         if (pt_prev) {
3146                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3147         } else {
3148                 atomic_long_inc(&skb->dev->rx_dropped);
3149                 kfree_skb(skb);
3150                 /* Jamal, now you will not able to escape explaining
3151                  * me how you were going to use this. :-)
3152                  */
3153                 ret = NET_RX_DROP;
3154         }
3155
3156 out:
3157         rcu_read_unlock();
3158         return ret;
3159 }
3160
3161 /**
3162  *      netif_receive_skb - process receive buffer from network
3163  *      @skb: buffer to process
3164  *
3165  *      netif_receive_skb() is the main receive data processing function.
3166  *      It always succeeds. The buffer may be dropped during processing
3167  *      for congestion control or by the protocol layers.
3168  *
3169  *      This function may only be called from softirq context and interrupts
3170  *      should be enabled.
3171  *
3172  *      Return values (usually ignored):
3173  *      NET_RX_SUCCESS: no congestion
3174  *      NET_RX_DROP: packet was dropped
3175  */
3176 int netif_receive_skb(struct sk_buff *skb)
3177 {
3178         if (netdev_tstamp_prequeue)
3179                 net_timestamp_check(skb);
3180
3181         if (skb_defer_rx_timestamp(skb))
3182                 return NET_RX_SUCCESS;
3183
3184 #ifdef CONFIG_RPS
3185         {
3186                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3187                 int cpu, ret;
3188
3189                 rcu_read_lock();
3190
3191                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3192
3193                 if (cpu >= 0) {
3194                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3195                         rcu_read_unlock();
3196                 } else {
3197                         rcu_read_unlock();
3198                         ret = __netif_receive_skb(skb);
3199                 }
3200
3201                 return ret;
3202         }
3203 #else
3204         return __netif_receive_skb(skb);
3205 #endif
3206 }
3207 EXPORT_SYMBOL(netif_receive_skb);
3208
3209 /* Network device is going away, flush any packets still pending
3210  * Called with irqs disabled.
3211  */
3212 static void flush_backlog(void *arg)
3213 {
3214         struct net_device *dev = arg;
3215         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3216         struct sk_buff *skb, *tmp;
3217
3218         rps_lock(sd);
3219         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3220                 if (skb->dev == dev) {
3221                         __skb_unlink(skb, &sd->input_pkt_queue);
3222                         kfree_skb(skb);
3223                         input_queue_head_incr(sd);
3224                 }
3225         }
3226         rps_unlock(sd);
3227
3228         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3229                 if (skb->dev == dev) {
3230                         __skb_unlink(skb, &sd->process_queue);
3231                         kfree_skb(skb);
3232                         input_queue_head_incr(sd);
3233                 }
3234         }
3235 }
3236
3237 static int napi_gro_complete(struct sk_buff *skb)
3238 {
3239         struct packet_type *ptype;
3240         __be16 type = skb->protocol;
3241         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3242         int err = -ENOENT;
3243
3244         if (NAPI_GRO_CB(skb)->count == 1) {
3245                 skb_shinfo(skb)->gso_size = 0;
3246                 goto out;
3247         }
3248
3249         rcu_read_lock();
3250         list_for_each_entry_rcu(ptype, head, list) {
3251                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3252                         continue;
3253
3254                 err = ptype->gro_complete(skb);
3255                 break;
3256         }
3257         rcu_read_unlock();
3258
3259         if (err) {
3260                 WARN_ON(&ptype->list == head);
3261                 kfree_skb(skb);
3262                 return NET_RX_SUCCESS;
3263         }
3264
3265 out:
3266         return netif_receive_skb(skb);
3267 }
3268
3269 inline void napi_gro_flush(struct napi_struct *napi)
3270 {
3271         struct sk_buff *skb, *next;
3272
3273         for (skb = napi->gro_list; skb; skb = next) {
3274                 next = skb->next;
3275                 skb->next = NULL;
3276                 napi_gro_complete(skb);
3277         }
3278
3279         napi->gro_count = 0;
3280         napi->gro_list = NULL;
3281 }
3282 EXPORT_SYMBOL(napi_gro_flush);
3283
3284 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3285 {
3286         struct sk_buff **pp = NULL;
3287         struct packet_type *ptype;
3288         __be16 type = skb->protocol;
3289         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3290         int same_flow;
3291         int mac_len;
3292         enum gro_result ret;
3293
3294         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3295                 goto normal;
3296
3297         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3298                 goto normal;
3299
3300         rcu_read_lock();
3301         list_for_each_entry_rcu(ptype, head, list) {
3302                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3303                         continue;
3304
3305                 skb_set_network_header(skb, skb_gro_offset(skb));
3306                 mac_len = skb->network_header - skb->mac_header;
3307                 skb->mac_len = mac_len;
3308                 NAPI_GRO_CB(skb)->same_flow = 0;
3309                 NAPI_GRO_CB(skb)->flush = 0;
3310                 NAPI_GRO_CB(skb)->free = 0;
3311
3312                 pp = ptype->gro_receive(&napi->gro_list, skb);
3313                 break;
3314         }
3315         rcu_read_unlock();
3316
3317         if (&ptype->list == head)
3318                 goto normal;
3319
3320         same_flow = NAPI_GRO_CB(skb)->same_flow;
3321         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3322
3323         if (pp) {
3324                 struct sk_buff *nskb = *pp;
3325
3326                 *pp = nskb->next;
3327                 nskb->next = NULL;
3328                 napi_gro_complete(nskb);
3329                 napi->gro_count--;
3330         }
3331
3332         if (same_flow)
3333                 goto ok;
3334
3335         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3336                 goto normal;
3337
3338         napi->gro_count++;
3339         NAPI_GRO_CB(skb)->count = 1;
3340         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3341         skb->next = napi->gro_list;
3342         napi->gro_list = skb;
3343         ret = GRO_HELD;
3344
3345 pull:
3346         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3347                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3348
3349                 BUG_ON(skb->end - skb->tail < grow);
3350
3351                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3352
3353                 skb->tail += grow;
3354                 skb->data_len -= grow;
3355
3356                 skb_shinfo(skb)->frags[0].page_offset += grow;
3357                 skb_shinfo(skb)->frags[0].size -= grow;
3358
3359                 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3360                         put_page(skb_shinfo(skb)->frags[0].page);
3361                         memmove(skb_shinfo(skb)->frags,
3362                                 skb_shinfo(skb)->frags + 1,
3363                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3364                 }
3365         }
3366
3367 ok:
3368         return ret;
3369
3370 normal:
3371         ret = GRO_NORMAL;
3372         goto pull;
3373 }
3374 EXPORT_SYMBOL(dev_gro_receive);
3375
3376 static inline gro_result_t
3377 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3378 {
3379         struct sk_buff *p;
3380
3381         for (p = napi->gro_list; p; p = p->next) {
3382                 unsigned long diffs;
3383
3384                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3385                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3386                 diffs |= compare_ether_header(skb_mac_header(p),
3387                                               skb_gro_mac_header(skb));
3388                 NAPI_GRO_CB(p)->same_flow = !diffs;
3389                 NAPI_GRO_CB(p)->flush = 0;
3390         }
3391
3392         return dev_gro_receive(napi, skb);
3393 }
3394
3395 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3396 {
3397         switch (ret) {
3398         case GRO_NORMAL:
3399                 if (netif_receive_skb(skb))
3400                         ret = GRO_DROP;
3401                 break;
3402
3403         case GRO_DROP:
3404         case GRO_MERGED_FREE:
3405                 kfree_skb(skb);
3406                 break;
3407
3408         case GRO_HELD:
3409         case GRO_MERGED:
3410                 break;
3411         }
3412
3413         return ret;
3414 }
3415 EXPORT_SYMBOL(napi_skb_finish);
3416
3417 void skb_gro_reset_offset(struct sk_buff *skb)
3418 {
3419         NAPI_GRO_CB(skb)->data_offset = 0;
3420         NAPI_GRO_CB(skb)->frag0 = NULL;
3421         NAPI_GRO_CB(skb)->frag0_len = 0;
3422
3423         if (skb->mac_header == skb->tail &&
3424             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3425                 NAPI_GRO_CB(skb)->frag0 =
3426                         page_address(skb_shinfo(skb)->frags[0].page) +
3427                         skb_shinfo(skb)->frags[0].page_offset;
3428                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3429         }
3430 }
3431 EXPORT_SYMBOL(skb_gro_reset_offset);
3432
3433 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3434 {
3435         skb_gro_reset_offset(skb);
3436
3437         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3438 }
3439 EXPORT_SYMBOL(napi_gro_receive);
3440
3441 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3442 {
3443         __skb_pull(skb, skb_headlen(skb));
3444         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3445         skb->vlan_tci = 0;
3446
3447         napi->skb = skb;
3448 }
3449
3450 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3451 {
3452         struct sk_buff *skb = napi->skb;
3453
3454         if (!skb) {
3455                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3456                 if (skb)
3457                         napi->skb = skb;
3458         }
3459         return skb;
3460 }
3461 EXPORT_SYMBOL(napi_get_frags);
3462
3463 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3464                                gro_result_t ret)
3465 {
3466         switch (ret) {
3467         case GRO_NORMAL:
3468         case GRO_HELD:
3469                 skb->protocol = eth_type_trans(skb, skb->dev);
3470
3471                 if (ret == GRO_HELD)
3472                         skb_gro_pull(skb, -ETH_HLEN);
3473                 else if (netif_receive_skb(skb))
3474                         ret = GRO_DROP;
3475                 break;
3476
3477         case GRO_DROP:
3478         case GRO_MERGED_FREE:
3479                 napi_reuse_skb(napi, skb);
3480                 break;
3481
3482         case GRO_MERGED:
3483                 break;
3484         }
3485
3486         return ret;
3487 }
3488 EXPORT_SYMBOL(napi_frags_finish);
3489
3490 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3491 {
3492         struct sk_buff *skb = napi->skb;
3493         struct ethhdr *eth;
3494         unsigned int hlen;
3495         unsigned int off;
3496
3497         napi->skb = NULL;
3498
3499         skb_reset_mac_header(skb);
3500         skb_gro_reset_offset(skb);
3501
3502         off = skb_gro_offset(skb);
3503         hlen = off + sizeof(*eth);
3504         eth = skb_gro_header_fast(skb, off);
3505         if (skb_gro_header_hard(skb, hlen)) {
3506                 eth = skb_gro_header_slow(skb, hlen, off);
3507                 if (unlikely(!eth)) {
3508                         napi_reuse_skb(napi, skb);
3509                         skb = NULL;
3510                         goto out;
3511                 }
3512         }
3513
3514         skb_gro_pull(skb, sizeof(*eth));
3515
3516         /*
3517          * This works because the only protocols we care about don't require
3518          * special handling.  We'll fix it up properly at the end.
3519          */
3520         skb->protocol = eth->h_proto;
3521
3522 out:
3523         return skb;
3524 }
3525 EXPORT_SYMBOL(napi_frags_skb);
3526
3527 gro_result_t napi_gro_frags(struct napi_struct *napi)
3528 {
3529         struct sk_buff *skb = napi_frags_skb(napi);
3530
3531         if (!skb)
3532                 return GRO_DROP;
3533
3534         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3535 }
3536 EXPORT_SYMBOL(napi_gro_frags);
3537
3538 /*
3539  * net_rps_action sends any pending IPI's for rps.
3540  * Note: called with local irq disabled, but exits with local irq enabled.
3541  */
3542 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3543 {
3544 #ifdef CONFIG_RPS
3545         struct softnet_data *remsd = sd->rps_ipi_list;
3546
3547         if (remsd) {
3548                 sd->rps_ipi_list = NULL;
3549
3550                 local_irq_enable();
3551
3552                 /* Send pending IPI's to kick RPS processing on remote cpus. */
3553                 while (remsd) {
3554                         struct softnet_data *next = remsd->rps_ipi_next;
3555
3556                         if (cpu_online(remsd->cpu))
3557                                 __smp_call_function_single(remsd->cpu,
3558                                                            &remsd->csd, 0);
3559                         remsd = next;
3560                 }
3561         } else
3562 #endif
3563                 local_irq_enable();
3564 }
3565
3566 static int process_backlog(struct napi_struct *napi, int quota)
3567 {
3568         int work = 0;
3569         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3570
3571 #ifdef CONFIG_RPS
3572         /* Check if we have pending ipi, its better to send them now,
3573          * not waiting net_rx_action() end.
3574          */
3575         if (sd->rps_ipi_list) {
3576                 local_irq_disable();
3577                 net_rps_action_and_irq_enable(sd);
3578         }
3579 #endif
3580         napi->weight = weight_p;
3581         local_irq_disable();
3582         while (work < quota) {
3583                 struct sk_buff *skb;
3584                 unsigned int qlen;
3585
3586                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3587                         local_irq_enable();
3588                         __netif_receive_skb(skb);
3589                         local_irq_disable();
3590                         input_queue_head_incr(sd);
3591                         if (++work >= quota) {
3592                                 local_irq_enable();
3593                                 return work;
3594                         }
3595                 }
3596
3597                 rps_lock(sd);
3598                 qlen = skb_queue_len(&sd->input_pkt_queue);
3599                 if (qlen)
3600                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3601                                                    &sd->process_queue);
3602
3603                 if (qlen < quota - work) {
3604                         /*
3605                          * Inline a custom version of __napi_complete().
3606                          * only current cpu owns and manipulates this napi,
3607                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3608                          * we can use a plain write instead of clear_bit(),
3609                          * and we dont need an smp_mb() memory barrier.
3610                          */
3611                         list_del(&napi->poll_list);
3612                         napi->state = 0;
3613
3614                         quota = work + qlen;
3615                 }
3616                 rps_unlock(sd);
3617         }
3618         local_irq_enable();
3619
3620         return work;
3621 }
3622
3623 /**
3624  * __napi_schedule - schedule for receive
3625  * @n: entry to schedule
3626  *
3627  * The entry's receive function will be scheduled to run
3628  */
3629 void __napi_schedule(struct napi_struct *n)
3630 {
3631         unsigned long flags;
3632
3633         local_irq_save(flags);
3634         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3635         local_irq_restore(flags);
3636 }
3637 EXPORT_SYMBOL(__napi_schedule);
3638
3639 void __napi_complete(struct napi_struct *n)
3640 {
3641         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3642         BUG_ON(n->gro_list);
3643
3644         list_del(&n->poll_list);
3645         smp_mb__before_clear_bit();
3646         clear_bit(NAPI_STATE_SCHED, &n->state);
3647 }
3648 EXPORT_SYMBOL(__napi_complete);
3649
3650 void napi_complete(struct napi_struct *n)
3651 {
3652         unsigned long flags;
3653
3654         /*
3655          * don't let napi dequeue from the cpu poll list
3656          * just in case its running on a different cpu
3657          */
3658         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3659                 return;
3660
3661         napi_gro_flush(n);
3662         local_irq_save(flags);
3663         __napi_complete(n);
3664         local_irq_restore(flags);
3665 }
3666 EXPORT_SYMBOL(napi_complete);
3667
3668 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3669                     int (*poll)(struct napi_struct *, int), int weight)
3670 {
3671         INIT_LIST_HEAD(&napi->poll_list);
3672         napi->gro_count = 0;
3673         napi->gro_list = NULL;
3674         napi->skb = NULL;
3675         napi->poll = poll;
3676         napi->weight = weight;
3677         list_add(&napi->dev_list, &dev->napi_list);
3678         napi->dev = dev;
3679 #ifdef CONFIG_NETPOLL
3680         spin_lock_init(&napi->poll_lock);
3681         napi->poll_owner = -1;
3682 #endif
3683         set_bit(NAPI_STATE_SCHED, &napi->state);
3684 }
3685 EXPORT_SYMBOL(netif_napi_add);
3686
3687 void netif_napi_del(struct napi_struct *napi)
3688 {
3689         struct sk_buff *skb, *next;
3690
3691         list_del_init(&napi->dev_list);
3692         napi_free_frags(napi);
3693
3694         for (skb = napi->gro_list; skb; skb = next) {
3695                 next = skb->next;
3696                 skb->next = NULL;
3697                 kfree_skb(skb);
3698         }
3699
3700         napi->gro_list = NULL;
3701         napi->gro_count = 0;
3702 }
3703 EXPORT_SYMBOL(netif_napi_del);
3704
3705 static void net_rx_action(struct softirq_action *h)
3706 {
3707         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3708         unsigned long time_limit = jiffies + 2;
3709         int budget = netdev_budget;
3710         void *have;
3711
3712         local_irq_disable();
3713
3714         while (!list_empty(&sd->poll_list)) {
3715                 struct napi_struct *n;
3716                 int work, weight;
3717
3718                 /* If softirq window is exhuasted then punt.
3719                  * Allow this to run for 2 jiffies since which will allow
3720                  * an average latency of 1.5/HZ.
3721                  */
3722                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3723                         goto softnet_break;
3724
3725                 local_irq_enable();
3726
3727                 /* Even though interrupts have been re-enabled, this
3728                  * access is safe because interrupts can only add new
3729                  * entries to the tail of this list, and only ->poll()
3730                  * calls can remove this head entry from the list.
3731                  */
3732                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3733
3734                 have = netpoll_poll_lock(n);
3735
3736                 weight = n->weight;
3737
3738                 /* This NAPI_STATE_SCHED test is for avoiding a race
3739                  * with netpoll's poll_napi().  Only the entity which
3740                  * obtains the lock and sees NAPI_STATE_SCHED set will
3741                  * actually make the ->poll() call.  Therefore we avoid
3742                  * accidently calling ->poll() when NAPI is not scheduled.
3743                  */
3744                 work = 0;
3745                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3746                         work = n->poll(n, weight);
3747                         trace_napi_poll(n);
3748                 }
3749
3750                 WARN_ON_ONCE(work > weight);
3751
3752                 budget -= work;
3753
3754                 local_irq_disable();
3755
3756                 /* Drivers must not modify the NAPI state if they
3757                  * consume the entire weight.  In such cases this code
3758                  * still "owns" the NAPI instance and therefore can
3759                  * move the instance around on the list at-will.
3760                  */
3761                 if (unlikely(work == weight)) {
3762                         if (unlikely(napi_disable_pending(n))) {
3763                                 local_irq_enable();
3764                                 napi_complete(n);
3765                                 local_irq_disable();
3766                         } else
3767                                 list_move_tail(&n->poll_list, &sd->poll_list);
3768                 }
3769
3770                 netpoll_poll_unlock(have);
3771         }
3772 out:
3773         net_rps_action_and_irq_enable(sd);
3774
3775 #ifdef CONFIG_NET_DMA
3776         /*
3777          * There may not be any more sk_buffs coming right now, so push
3778          * any pending DMA copies to hardware
3779          */
3780         dma_issue_pending_all();
3781 #endif
3782
3783         return;
3784
3785 softnet_break:
3786         sd->time_squeeze++;
3787         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3788         goto out;
3789 }
3790
3791 static gifconf_func_t *gifconf_list[NPROTO];
3792
3793 /**
3794  *      register_gifconf        -       register a SIOCGIF handler
3795  *      @family: Address family
3796  *      @gifconf: Function handler
3797  *
3798  *      Register protocol dependent address dumping routines. The handler
3799  *      that is passed must not be freed or reused until it has been replaced
3800  *      by another handler.
3801  */
3802 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3803 {
3804         if (family >= NPROTO)
3805                 return -EINVAL;
3806         gifconf_list[family] = gifconf;
3807         return 0;
3808 }
3809 EXPORT_SYMBOL(register_gifconf);
3810
3811
3812 /*
3813  *      Map an interface index to its name (SIOCGIFNAME)
3814  */
3815
3816 /*
3817  *      We need this ioctl for efficient implementation of the
3818  *      if_indextoname() function required by the IPv6 API.  Without
3819  *      it, we would have to search all the interfaces to find a
3820  *      match.  --pb
3821  */
3822
3823 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3824 {
3825         struct net_device *dev;
3826         struct ifreq ifr;
3827
3828         /*
3829          *      Fetch the caller's info block.
3830          */
3831
3832         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3833                 return -EFAULT;
3834
3835         rcu_read_lock();
3836         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3837         if (!dev) {
3838                 rcu_read_unlock();
3839                 return -ENODEV;
3840         }
3841
3842         strcpy(ifr.ifr_name, dev->name);
3843         rcu_read_unlock();
3844
3845         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3846                 return -EFAULT;
3847         return 0;
3848 }
3849
3850 /*
3851  *      Perform a SIOCGIFCONF call. This structure will change
3852  *      size eventually, and there is nothing I can do about it.
3853  *      Thus we will need a 'compatibility mode'.
3854  */
3855
3856 static int dev_ifconf(struct net *net, char __user *arg)
3857 {
3858         struct ifconf ifc;
3859         struct net_device *dev;
3860         char __user *pos;
3861         int len;
3862         int total;
3863         int i;
3864
3865         /*
3866          *      Fetch the caller's info block.
3867          */
3868
3869         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3870                 return -EFAULT;
3871
3872         pos = ifc.ifc_buf;
3873         len = ifc.ifc_len;
3874
3875         /*
3876          *      Loop over the interfaces, and write an info block for each.
3877          */
3878
3879         total = 0;
3880         for_each_netdev(net, dev) {
3881                 for (i = 0; i < NPROTO; i++) {
3882                         if (gifconf_list[i]) {
3883                                 int done;
3884                                 if (!pos)
3885                                         done = gifconf_list[i](dev, NULL, 0);
3886                                 else
3887                                         done = gifconf_list[i](dev, pos + total,
3888                                                                len - total);
3889                                 if (done < 0)
3890                                         return -EFAULT;
3891                                 total += done;
3892                         }
3893                 }
3894         }
3895
3896         /*
3897          *      All done.  Write the updated control block back to the caller.
3898          */
3899         ifc.ifc_len = total;
3900
3901         /*
3902          *      Both BSD and Solaris return 0 here, so we do too.
3903          */
3904         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3905 }
3906
3907 #ifdef CONFIG_PROC_FS
3908 /*
3909  *      This is invoked by the /proc filesystem handler to display a device
3910  *      in detail.
3911  */
3912 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3913         __acquires(RCU)
3914 {
3915         struct net *net = seq_file_net(seq);
3916         loff_t off;
3917         struct net_device *dev;
3918
3919         rcu_read_lock();
3920         if (!*pos)
3921                 return SEQ_START_TOKEN;
3922
3923         off = 1;
3924         for_each_netdev_rcu(net, dev)
3925                 if (off++ == *pos)
3926                         return dev;
3927
3928         return NULL;
3929 }
3930
3931 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3932 {
3933         struct net_device *dev = (v == SEQ_START_TOKEN) ?
3934                                   first_net_device(seq_file_net(seq)) :
3935                                   next_net_device((struct net_device *)v);
3936
3937         ++*pos;
3938         return rcu_dereference(dev);
3939 }
3940
3941 void dev_seq_stop(struct seq_file *seq, void *v)
3942         __releases(RCU)
3943 {
3944         rcu_read_unlock();
3945 }
3946
3947 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3948 {
3949         struct rtnl_link_stats64 temp;
3950         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3951
3952         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3953                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3954                    dev->name, stats->rx_bytes, stats->rx_packets,
3955                    stats->rx_errors,
3956                    stats->rx_dropped + stats->rx_missed_errors,
3957                    stats->rx_fifo_errors,
3958                    stats->rx_length_errors + stats->rx_over_errors +
3959                     stats->rx_crc_errors + stats->rx_frame_errors,
3960                    stats->rx_compressed, stats->multicast,
3961                    stats->tx_bytes, stats->tx_packets,
3962                    stats->tx_errors, stats->tx_dropped,
3963                    stats->tx_fifo_errors, stats->collisions,
3964                    stats->tx_carrier_errors +
3965                     stats->tx_aborted_errors +
3966                     stats->tx_window_errors +
3967                     stats->tx_heartbeat_errors,
3968                    stats->tx_compressed);
3969 }
3970
3971 /*
3972  *      Called from the PROCfs module. This now uses the new arbitrary sized
3973  *      /proc/net interface to create /proc/net/dev
3974  */
3975 static int dev_seq_show(struct seq_file *seq, void *v)
3976 {
3977         if (v == SEQ_START_TOKEN)
3978                 seq_puts(seq, "Inter-|   Receive                            "
3979                               "                    |  Transmit\n"
3980                               " face |bytes    packets errs drop fifo frame "
3981                               "compressed multicast|bytes    packets errs "
3982                               "drop fifo colls carrier compressed\n");
3983         else
3984                 dev_seq_printf_stats(seq, v);
3985         return 0;
3986 }
3987
3988 static struct softnet_data *softnet_get_online(loff_t *pos)
3989 {
3990         struct softnet_data *sd = NULL;
3991
3992         while (*pos < nr_cpu_ids)
3993                 if (cpu_online(*pos)) {
3994                         sd = &per_cpu(softnet_data, *pos);
3995                         break;
3996                 } else
3997                         ++*pos;
3998         return sd;
3999 }
4000
4001 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4002 {
4003         return softnet_get_online(pos);
4004 }
4005
4006 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4007 {
4008         ++*pos;
4009         return softnet_get_online(pos);
4010 }
4011
4012 static void softnet_seq_stop(struct seq_file *seq, void *v)
4013 {
4014 }
4015
4016 static int softnet_seq_show(struct seq_file *seq, void *v)
4017 {
4018         struct softnet_data *sd = v;
4019
4020         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4021                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4022                    0, 0, 0, 0, /* was fastroute */
4023                    sd->cpu_collision, sd->received_rps);
4024         return 0;
4025 }
4026
4027 static const struct seq_operations dev_seq_ops = {
4028         .start = dev_seq_start,
4029         .next  = dev_seq_next,
4030         .stop  = dev_seq_stop,
4031         .show  = dev_seq_show,
4032 };
4033
4034 static int dev_seq_open(struct inode *inode, struct file *file)
4035 {
4036         return seq_open_net(inode, file, &dev_seq_ops,
4037                             sizeof(struct seq_net_private));
4038 }
4039
4040 static const struct file_operations dev_seq_fops = {
4041         .owner   = THIS_MODULE,
4042         .open    = dev_seq_open,
4043         .read    = seq_read,
4044         .llseek  = seq_lseek,
4045         .release = seq_release_net,
4046 };
4047
4048 static const struct seq_operations softnet_seq_ops = {
4049         .start = softnet_seq_start,
4050         .next  = softnet_seq_next,
4051         .stop  = softnet_seq_stop,
4052         .show  = softnet_seq_show,
4053 };
4054
4055 static int softnet_seq_open(struct inode *inode, struct file *file)
4056 {
4057         return seq_open(file, &softnet_seq_ops);
4058 }
4059
4060 static const struct file_operations softnet_seq_fops = {
4061         .owner   = THIS_MODULE,
4062         .open    = softnet_seq_open,
4063         .read    = seq_read,
4064         .llseek  = seq_lseek,
4065         .release = seq_release,
4066 };
4067
4068 static void *ptype_get_idx(loff_t pos)
4069 {
4070         struct packet_type *pt = NULL;
4071         loff_t i = 0;
4072         int t;
4073
4074         list_for_each_entry_rcu(pt, &ptype_all, list) {
4075                 if (i == pos)
4076                         return pt;
4077                 ++i;
4078         }
4079
4080         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4081                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4082                         if (i == pos)
4083                                 return pt;
4084                         ++i;
4085                 }
4086         }
4087         return NULL;
4088 }
4089
4090 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4091         __acquires(RCU)
4092 {
4093         rcu_read_lock();
4094         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4095 }
4096
4097 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4098 {
4099         struct packet_type *pt;
4100         struct list_head *nxt;
4101         int hash;
4102
4103         ++*pos;
4104         if (v == SEQ_START_TOKEN)
4105                 return ptype_get_idx(0);
4106
4107         pt = v;
4108         nxt = pt->list.next;
4109         if (pt->type == htons(ETH_P_ALL)) {
4110                 if (nxt != &ptype_all)
4111                         goto found;
4112                 hash = 0;
4113                 nxt = ptype_base[0].next;
4114         } else
4115                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4116
4117         while (nxt == &ptype_base[hash]) {
4118                 if (++hash >= PTYPE_HASH_SIZE)
4119                         return NULL;
4120                 nxt = ptype_base[hash].next;
4121         }
4122 found:
4123         return list_entry(nxt, struct packet_type, list);
4124 }
4125
4126 static void ptype_seq_stop(struct seq_file *seq, void *v)
4127         __releases(RCU)
4128 {
4129         rcu_read_unlock();
4130 }
4131
4132 static int ptype_seq_show(struct seq_file *seq, void *v)
4133 {
4134         struct packet_type *pt = v;
4135
4136         if (v == SEQ_START_TOKEN)
4137                 seq_puts(seq, "Type Device      Function\n");
4138         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4139                 if (pt->type == htons(ETH_P_ALL))
4140                         seq_puts(seq, "ALL ");
4141                 else
4142                         seq_printf(seq, "%04x", ntohs(pt->type));
4143
4144                 seq_printf(seq, " %-8s %pF\n",
4145                            pt->dev ? pt->dev->name : "", pt->func);
4146         }
4147
4148         return 0;
4149 }
4150
4151 static const struct seq_operations ptype_seq_ops = {
4152         .start = ptype_seq_start,
4153         .next  = ptype_seq_next,
4154         .stop  = ptype_seq_stop,
4155         .show  = ptype_seq_show,
4156 };
4157
4158 static int ptype_seq_open(struct inode *inode, struct file *file)
4159 {
4160         return seq_open_net(inode, file, &ptype_seq_ops,
4161                         sizeof(struct seq_net_private));
4162 }
4163
4164 static const struct file_operations ptype_seq_fops = {
4165         .owner   = THIS_MODULE,
4166         .open    = ptype_seq_open,
4167         .read    = seq_read,
4168         .llseek  = seq_lseek,
4169         .release = seq_release_net,
4170 };
4171
4172
4173 static int __net_init dev_proc_net_init(struct net *net)
4174 {
4175         int rc = -ENOMEM;
4176
4177         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4178                 goto out;
4179         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4180                 goto out_dev;
4181         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4182                 goto out_softnet;
4183
4184         if (wext_proc_init(net))
4185                 goto out_ptype;
4186         rc = 0;
4187 out:
4188         return rc;
4189 out_ptype:
4190         proc_net_remove(net, "ptype");
4191 out_softnet:
4192         proc_net_remove(net, "softnet_stat");
4193 out_dev:
4194         proc_net_remove(net, "dev");
4195         goto out;
4196 }
4197
4198 static void __net_exit dev_proc_net_exit(struct net *net)
4199 {
4200         wext_proc_exit(net);
4201
4202         proc_net_remove(net, "ptype");
4203         proc_net_remove(net, "softnet_stat");
4204         proc_net_remove(net, "dev");
4205 }
4206
4207 static struct pernet_operations __net_initdata dev_proc_ops = {
4208         .init = dev_proc_net_init,
4209         .exit = dev_proc_net_exit,
4210 };
4211
4212 static int __init dev_proc_init(void)
4213 {
4214         return register_pernet_subsys(&dev_proc_ops);
4215 }
4216 #else
4217 #define dev_proc_init() 0
4218 #endif  /* CONFIG_PROC_FS */
4219
4220
4221 /**
4222  *      netdev_set_master       -       set up master/slave pair
4223  *      @slave: slave device
4224  *      @master: new master device
4225  *
4226  *      Changes the master device of the slave. Pass %NULL to break the
4227  *      bonding. The caller must hold the RTNL semaphore. On a failure
4228  *      a negative errno code is returned. On success the reference counts
4229  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4230  *      function returns zero.
4231  */
4232 int netdev_set_master(struct net_device *slave, struct net_device *master)
4233 {
4234         struct net_device *old = slave->master;
4235
4236         ASSERT_RTNL();
4237
4238         if (master) {
4239                 if (old)
4240                         return -EBUSY;
4241                 dev_hold(master);
4242         }
4243
4244         slave->master = master;
4245
4246         if (old) {
4247                 synchronize_net();
4248                 dev_put(old);
4249         }
4250         if (master)
4251                 slave->flags |= IFF_SLAVE;
4252         else
4253                 slave->flags &= ~IFF_SLAVE;
4254
4255         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4256         return 0;
4257 }
4258 EXPORT_SYMBOL(netdev_set_master);
4259
4260 static void dev_change_rx_flags(struct net_device *dev, int flags)
4261 {
4262         const struct net_device_ops *ops = dev->netdev_ops;
4263
4264         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4265                 ops->ndo_change_rx_flags(dev, flags);
4266 }
4267
4268 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4269 {
4270         unsigned short old_flags = dev->flags;
4271         uid_t uid;
4272         gid_t gid;
4273
4274         ASSERT_RTNL();
4275
4276         dev->flags |= IFF_PROMISC;
4277         dev->promiscuity += inc;
4278         if (dev->promiscuity == 0) {
4279                 /*
4280                  * Avoid overflow.
4281                  * If inc causes overflow, untouch promisc and return error.
4282                  */
4283                 if (inc < 0)
4284                         dev->flags &= ~IFF_PROMISC;
4285                 else {
4286                         dev->promiscuity -= inc;
4287                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4288                                 "set promiscuity failed, promiscuity feature "
4289                                 "of device might be broken.\n", dev->name);
4290                         return -EOVERFLOW;
4291                 }
4292         }
4293         if (dev->flags != old_flags) {
4294                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4295                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4296                                                                "left");
4297                 if (audit_enabled) {
4298                         current_uid_gid(&uid, &gid);
4299                         audit_log(current->audit_context, GFP_ATOMIC,
4300                                 AUDIT_ANOM_PROMISCUOUS,
4301                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4302                                 dev->name, (dev->flags & IFF_PROMISC),
4303                                 (old_flags & IFF_PROMISC),
4304                                 audit_get_loginuid(current),
4305                                 uid, gid,
4306                                 audit_get_sessionid(current));
4307                 }
4308
4309                 dev_change_rx_flags(dev, IFF_PROMISC);
4310         }
4311         return 0;
4312 }
4313
4314 /**
4315  *      dev_set_promiscuity     - update promiscuity count on a device
4316  *      @dev: device
4317  *      @inc: modifier
4318  *
4319  *      Add or remove promiscuity from a device. While the count in the device
4320  *      remains above zero the interface remains promiscuous. Once it hits zero
4321  *      the device reverts back to normal filtering operation. A negative inc
4322  *      value is used to drop promiscuity on the device.
4323  *      Return 0 if successful or a negative errno code on error.
4324  */
4325 int dev_set_promiscuity(struct net_device *dev, int inc)
4326 {
4327         unsigned short old_flags = dev->flags;
4328         int err;
4329
4330         err = __dev_set_promiscuity(dev, inc);
4331         if (err < 0)
4332                 return err;
4333         if (dev->flags != old_flags)
4334                 dev_set_rx_mode(dev);
4335         return err;
4336 }
4337 EXPORT_SYMBOL(dev_set_promiscuity);
4338
4339 /**
4340  *      dev_set_allmulti        - update allmulti count on a device
4341  *      @dev: device
4342  *      @inc: modifier
4343  *
4344  *      Add or remove reception of all multicast frames to a device. While the
4345  *      count in the device remains above zero the interface remains listening
4346  *      to all interfaces. Once it hits zero the device reverts back to normal
4347  *      filtering operation. A negative @inc value is used to drop the counter
4348  *      when releasing a resource needing all multicasts.
4349  *      Return 0 if successful or a negative errno code on error.
4350  */
4351
4352 int dev_set_allmulti(struct net_device *dev, int inc)
4353 {
4354         unsigned short old_flags = dev->flags;
4355
4356         ASSERT_RTNL();
4357
4358         dev->flags |= IFF_ALLMULTI;
4359         dev->allmulti += inc;
4360         if (dev->allmulti == 0) {
4361                 /*
4362                  * Avoid overflow.
4363                  * If inc causes overflow, untouch allmulti and return error.
4364                  */
4365                 if (inc < 0)
4366                         dev->flags &= ~IFF_ALLMULTI;
4367                 else {
4368                         dev->allmulti -= inc;
4369                         printk(KERN_WARNING "%s: allmulti touches roof, "
4370                                 "set allmulti failed, allmulti feature of "
4371                                 "device might be broken.\n", dev->name);
4372                         return -EOVERFLOW;
4373                 }
4374         }
4375         if (dev->flags ^ old_flags) {
4376                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4377                 dev_set_rx_mode(dev);
4378         }
4379         return 0;
4380 }
4381 EXPORT_SYMBOL(dev_set_allmulti);
4382
4383 /*
4384  *      Upload unicast and multicast address lists to device and
4385  *      configure RX filtering. When the device doesn't support unicast
4386  *      filtering it is put in promiscuous mode while unicast addresses
4387  *      are present.
4388  */
4389 void __dev_set_rx_mode(struct net_device *dev)
4390 {
4391         const struct net_device_ops *ops = dev->netdev_ops;
4392
4393         /* dev_open will call this function so the list will stay sane. */
4394         if (!(dev->flags&IFF_UP))
4395                 return;
4396
4397         if (!netif_device_present(dev))
4398                 return;
4399
4400         if (ops->ndo_set_rx_mode)
4401                 ops->ndo_set_rx_mode(dev);
4402         else {
4403                 /* Unicast addresses changes may only happen under the rtnl,
4404                  * therefore calling __dev_set_promiscuity here is safe.
4405                  */
4406                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4407                         __dev_set_promiscuity(dev, 1);
4408                         dev->uc_promisc = 1;
4409                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4410                         __dev_set_promiscuity(dev, -1);
4411                         dev->uc_promisc = 0;
4412                 }
4413
4414                 if (ops->ndo_set_multicast_list)
4415                         ops->ndo_set_multicast_list(dev);
4416         }
4417 }
4418
4419 void dev_set_rx_mode(struct net_device *dev)
4420 {
4421         netif_addr_lock_bh(dev);
4422         __dev_set_rx_mode(dev);
4423         netif_addr_unlock_bh(dev);
4424 }
4425
4426 /**
4427  *      dev_get_flags - get flags reported to userspace
4428  *      @dev: device
4429  *
4430  *      Get the combination of flag bits exported through APIs to userspace.
4431  */
4432 unsigned dev_get_flags(const struct net_device *dev)
4433 {
4434         unsigned flags;
4435
4436         flags = (dev->flags & ~(IFF_PROMISC |
4437                                 IFF_ALLMULTI |
4438                                 IFF_RUNNING |
4439                                 IFF_LOWER_UP |
4440                                 IFF_DORMANT)) |
4441                 (dev->gflags & (IFF_PROMISC |
4442                                 IFF_ALLMULTI));
4443
4444         if (netif_running(dev)) {
4445                 if (netif_oper_up(dev))
4446                         flags |= IFF_RUNNING;
4447                 if (netif_carrier_ok(dev))
4448                         flags |= IFF_LOWER_UP;
4449                 if (netif_dormant(dev))
4450                         flags |= IFF_DORMANT;
4451         }
4452
4453         return flags;
4454 }
4455 EXPORT_SYMBOL(dev_get_flags);
4456
4457 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4458 {
4459         int old_flags = dev->flags;
4460         int ret;
4461
4462         ASSERT_RTNL();
4463
4464         /*
4465          *      Set the flags on our device.
4466          */
4467
4468         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4469                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4470                                IFF_AUTOMEDIA)) |
4471                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4472                                     IFF_ALLMULTI));
4473
4474         /*
4475          *      Load in the correct multicast list now the flags have changed.
4476          */
4477
4478         if ((old_flags ^ flags) & IFF_MULTICAST)
4479                 dev_change_rx_flags(dev, IFF_MULTICAST);
4480
4481         dev_set_rx_mode(dev);
4482
4483         /*
4484          *      Have we downed the interface. We handle IFF_UP ourselves
4485          *      according to user attempts to set it, rather than blindly
4486          *      setting it.
4487          */
4488
4489         ret = 0;
4490         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4491                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4492
4493                 if (!ret)
4494                         dev_set_rx_mode(dev);
4495         }
4496
4497         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4498                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4499
4500                 dev->gflags ^= IFF_PROMISC;
4501                 dev_set_promiscuity(dev, inc);
4502         }
4503
4504         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4505            is important. Some (broken) drivers set IFF_PROMISC, when
4506            IFF_ALLMULTI is requested not asking us and not reporting.
4507          */
4508         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4509                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4510
4511                 dev->gflags ^= IFF_ALLMULTI;
4512                 dev_set_allmulti(dev, inc);
4513         }
4514
4515         return ret;
4516 }
4517
4518 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4519 {
4520         unsigned int changes = dev->flags ^ old_flags;
4521
4522         if (changes & IFF_UP) {
4523                 if (dev->flags & IFF_UP)
4524                         call_netdevice_notifiers(NETDEV_UP, dev);
4525                 else
4526                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4527         }
4528
4529         if (dev->flags & IFF_UP &&
4530             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4531                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4532 }
4533
4534 /**
4535  *      dev_change_flags - change device settings
4536  *      @dev: device
4537  *      @flags: device state flags
4538  *
4539  *      Change settings on device based state flags. The flags are
4540  *      in the userspace exported format.
4541  */
4542 int dev_change_flags(struct net_device *dev, unsigned flags)
4543 {
4544         int ret, changes;
4545         int old_flags = dev->flags;
4546
4547         ret = __dev_change_flags(dev, flags);
4548         if (ret < 0)
4549                 return ret;
4550
4551         changes = old_flags ^ dev->flags;
4552         if (changes)
4553                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4554
4555         __dev_notify_flags(dev, old_flags);
4556         return ret;
4557 }
4558 EXPORT_SYMBOL(dev_change_flags);
4559
4560 /**
4561  *      dev_set_mtu - Change maximum transfer unit
4562  *      @dev: device
4563  *      @new_mtu: new transfer unit
4564  *
4565  *      Change the maximum transfer size of the network device.
4566  */
4567 int dev_set_mtu(struct net_device *dev, int new_mtu)
4568 {
4569         const struct net_device_ops *ops = dev->netdev_ops;
4570         int err;
4571
4572         if (new_mtu == dev->mtu)
4573                 return 0;
4574
4575         /*      MTU must be positive.    */
4576         if (new_mtu < 0)
4577                 return -EINVAL;
4578
4579         if (!netif_device_present(dev))
4580                 return -ENODEV;
4581
4582         err = 0;
4583         if (ops->ndo_change_mtu)
4584                 err = ops->ndo_change_mtu(dev, new_mtu);
4585         else
4586                 dev->mtu = new_mtu;
4587
4588         if (!err && dev->flags & IFF_UP)
4589                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4590         return err;
4591 }
4592 EXPORT_SYMBOL(dev_set_mtu);
4593
4594 /**
4595  *      dev_set_mac_address - Change Media Access Control Address
4596  *      @dev: device
4597  *      @sa: new address
4598  *
4599  *      Change the hardware (MAC) address of the device
4600  */
4601 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4602 {
4603         const struct net_device_ops *ops = dev->netdev_ops;
4604         int err;
4605
4606         if (!ops->ndo_set_mac_address)
4607                 return -EOPNOTSUPP;
4608         if (sa->sa_family != dev->type)
4609                 return -EINVAL;
4610         if (!netif_device_present(dev))
4611                 return -ENODEV;
4612         err = ops->ndo_set_mac_address(dev, sa);
4613         if (!err)
4614                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4615         return err;
4616 }
4617 EXPORT_SYMBOL(dev_set_mac_address);
4618
4619 /*
4620  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4621  */
4622 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4623 {
4624         int err;
4625         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4626
4627         if (!dev)
4628                 return -ENODEV;
4629
4630         switch (cmd) {
4631         case SIOCGIFFLAGS:      /* Get interface flags */
4632                 ifr->ifr_flags = (short) dev_get_flags(dev);
4633                 return 0;
4634
4635         case SIOCGIFMETRIC:     /* Get the metric on the interface
4636                                    (currently unused) */
4637                 ifr->ifr_metric = 0;
4638                 return 0;
4639
4640         case SIOCGIFMTU:        /* Get the MTU of a device */
4641                 ifr->ifr_mtu = dev->mtu;
4642                 return 0;
4643
4644         case SIOCGIFHWADDR:
4645                 if (!dev->addr_len)
4646                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4647                 else
4648                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4649                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4650                 ifr->ifr_hwaddr.sa_family = dev->type;
4651                 return 0;
4652
4653         case SIOCGIFSLAVE:
4654                 err = -EINVAL;
4655                 break;
4656
4657         case SIOCGIFMAP:
4658                 ifr->ifr_map.mem_start = dev->mem_start;
4659                 ifr->ifr_map.mem_end   = dev->mem_end;
4660                 ifr->ifr_map.base_addr = dev->base_addr;
4661                 ifr->ifr_map.irq       = dev->irq;
4662                 ifr->ifr_map.dma       = dev->dma;
4663                 ifr->ifr_map.port      = dev->if_port;
4664                 return 0;
4665
4666         case SIOCGIFINDEX:
4667                 ifr->ifr_ifindex = dev->ifindex;
4668                 return 0;
4669
4670         case SIOCGIFTXQLEN:
4671                 ifr->ifr_qlen = dev->tx_queue_len;
4672                 return 0;
4673
4674         default:
4675                 /* dev_ioctl() should ensure this case
4676                  * is never reached
4677                  */
4678                 WARN_ON(1);
4679                 err = -EINVAL;
4680                 break;
4681
4682         }
4683         return err;
4684 }
4685
4686 /*
4687  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4688  */
4689 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4690 {
4691         int err;
4692         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4693         const struct net_device_ops *ops;
4694
4695         if (!dev)
4696                 return -ENODEV;
4697
4698         ops = dev->netdev_ops;
4699
4700         switch (cmd) {
4701         case SIOCSIFFLAGS:      /* Set interface flags */
4702                 return dev_change_flags(dev, ifr->ifr_flags);
4703
4704         case SIOCSIFMETRIC:     /* Set the metric on the interface
4705                                    (currently unused) */
4706                 return -EOPNOTSUPP;
4707
4708         case SIOCSIFMTU:        /* Set the MTU of a device */
4709                 return dev_set_mtu(dev, ifr->ifr_mtu);
4710
4711         case SIOCSIFHWADDR:
4712                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4713
4714         case SIOCSIFHWBROADCAST:
4715                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4716                         return -EINVAL;
4717                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4718                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4719                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4720                 return 0;
4721
4722         case SIOCSIFMAP:
4723                 if (ops->ndo_set_config) {
4724                         if (!netif_device_present(dev))
4725                                 return -ENODEV;
4726                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4727                 }
4728                 return -EOPNOTSUPP;
4729
4730         case SIOCADDMULTI:
4731                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4732                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4733                         return -EINVAL;
4734                 if (!netif_device_present(dev))
4735                         return -ENODEV;
4736                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4737
4738         case SIOCDELMULTI:
4739                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4740                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4741                         return -EINVAL;
4742                 if (!netif_device_present(dev))
4743                         return -ENODEV;
4744                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4745
4746         case SIOCSIFTXQLEN:
4747                 if (ifr->ifr_qlen < 0)
4748                         return -EINVAL;
4749                 dev->tx_queue_len = ifr->ifr_qlen;
4750                 return 0;
4751
4752         case SIOCSIFNAME:
4753                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4754                 return dev_change_name(dev, ifr->ifr_newname);
4755
4756         /*
4757          *      Unknown or private ioctl
4758          */
4759         default:
4760                 if ((cmd >= SIOCDEVPRIVATE &&
4761                     cmd <= SIOCDEVPRIVATE + 15) ||
4762                     cmd == SIOCBONDENSLAVE ||
4763                     cmd == SIOCBONDRELEASE ||
4764                     cmd == SIOCBONDSETHWADDR ||
4765                     cmd == SIOCBONDSLAVEINFOQUERY ||
4766                     cmd == SIOCBONDINFOQUERY ||
4767                     cmd == SIOCBONDCHANGEACTIVE ||
4768                     cmd == SIOCGMIIPHY ||
4769                     cmd == SIOCGMIIREG ||
4770                     cmd == SIOCSMIIREG ||
4771                     cmd == SIOCBRADDIF ||
4772                     cmd == SIOCBRDELIF ||
4773                     cmd == SIOCSHWTSTAMP ||
4774                     cmd == SIOCWANDEV) {
4775                         err = -EOPNOTSUPP;
4776                         if (ops->ndo_do_ioctl) {
4777                                 if (netif_device_present(dev))
4778                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4779                                 else
4780                                         err = -ENODEV;
4781                         }
4782                 } else
4783                         err = -EINVAL;
4784
4785         }
4786         return err;
4787 }
4788
4789 /*
4790  *      This function handles all "interface"-type I/O control requests. The actual
4791  *      'doing' part of this is dev_ifsioc above.
4792  */
4793
4794 /**
4795  *      dev_ioctl       -       network device ioctl
4796  *      @net: the applicable net namespace
4797  *      @cmd: command to issue
4798  *      @arg: pointer to a struct ifreq in user space
4799  *
4800  *      Issue ioctl functions to devices. This is normally called by the
4801  *      user space syscall interfaces but can sometimes be useful for
4802  *      other purposes. The return value is the return from the syscall if
4803  *      positive or a negative errno code on error.
4804  */
4805
4806 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4807 {
4808         struct ifreq ifr;
4809         int ret;
4810         char *colon;
4811
4812         /* One special case: SIOCGIFCONF takes ifconf argument
4813            and requires shared lock, because it sleeps writing
4814            to user space.
4815          */
4816
4817         if (cmd == SIOCGIFCONF) {
4818                 rtnl_lock();
4819                 ret = dev_ifconf(net, (char __user *) arg);
4820                 rtnl_unlock();
4821                 return ret;
4822         }
4823         if (cmd == SIOCGIFNAME)
4824                 return dev_ifname(net, (struct ifreq __user *)arg);
4825
4826         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4827                 return -EFAULT;
4828
4829         ifr.ifr_name[IFNAMSIZ-1] = 0;
4830
4831         colon = strchr(ifr.ifr_name, ':');
4832         if (colon)
4833                 *colon = 0;
4834
4835         /*
4836          *      See which interface the caller is talking about.
4837          */
4838
4839         switch (cmd) {
4840         /*
4841          *      These ioctl calls:
4842          *      - can be done by all.
4843          *      - atomic and do not require locking.
4844          *      - return a value
4845          */
4846         case SIOCGIFFLAGS:
4847         case SIOCGIFMETRIC:
4848         case SIOCGIFMTU:
4849         case SIOCGIFHWADDR:
4850         case SIOCGIFSLAVE:
4851         case SIOCGIFMAP:
4852         case SIOCGIFINDEX:
4853         case SIOCGIFTXQLEN:
4854                 dev_load(net, ifr.ifr_name);
4855                 rcu_read_lock();
4856                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4857                 rcu_read_unlock();
4858                 if (!ret) {
4859                         if (colon)
4860                                 *colon = ':';
4861                         if (copy_to_user(arg, &ifr,
4862                                          sizeof(struct ifreq)))
4863                                 ret = -EFAULT;
4864                 }
4865                 return ret;
4866
4867         case SIOCETHTOOL:
4868                 dev_load(net, ifr.ifr_name);
4869                 rtnl_lock();
4870                 ret = dev_ethtool(net, &ifr);
4871                 rtnl_unlock();
4872                 if (!ret) {
4873                         if (colon)
4874                                 *colon = ':';
4875                         if (copy_to_user(arg, &ifr,
4876                                          sizeof(struct ifreq)))
4877                                 ret = -EFAULT;
4878                 }
4879                 return ret;
4880
4881         /*
4882          *      These ioctl calls:
4883          *      - require superuser power.
4884          *      - require strict serialization.
4885          *      - return a value
4886          */
4887         case SIOCGMIIPHY:
4888         case SIOCGMIIREG:
4889         case SIOCSIFNAME:
4890                 if (!capable(CAP_NET_ADMIN))
4891                         return -EPERM;
4892                 dev_load(net, ifr.ifr_name);
4893                 rtnl_lock();
4894                 ret = dev_ifsioc(net, &ifr, cmd);
4895                 rtnl_unlock();
4896                 if (!ret) {
4897                         if (colon)
4898                                 *colon = ':';
4899                         if (copy_to_user(arg, &ifr,
4900                                          sizeof(struct ifreq)))
4901                                 ret = -EFAULT;
4902                 }
4903                 return ret;
4904
4905         /*
4906          *      These ioctl calls:
4907          *      - require superuser power.
4908          *      - require strict serialization.
4909          *      - do not return a value
4910          */
4911         case SIOCSIFFLAGS:
4912         case SIOCSIFMETRIC:
4913         case SIOCSIFMTU:
4914         case SIOCSIFMAP:
4915         case SIOCSIFHWADDR:
4916         case SIOCSIFSLAVE:
4917         case SIOCADDMULTI:
4918         case SIOCDELMULTI:
4919         case SIOCSIFHWBROADCAST:
4920         case SIOCSIFTXQLEN:
4921         case SIOCSMIIREG:
4922         case SIOCBONDENSLAVE:
4923         case SIOCBONDRELEASE:
4924         case SIOCBONDSETHWADDR:
4925         case SIOCBONDCHANGEACTIVE:
4926         case SIOCBRADDIF:
4927         case SIOCBRDELIF:
4928         case SIOCSHWTSTAMP:
4929                 if (!capable(CAP_NET_ADMIN))
4930                         return -EPERM;
4931                 /* fall through */
4932         case SIOCBONDSLAVEINFOQUERY:
4933         case SIOCBONDINFOQUERY:
4934                 dev_load(net, ifr.ifr_name);
4935                 rtnl_lock();
4936                 ret = dev_ifsioc(net, &ifr, cmd);
4937                 rtnl_unlock();
4938                 return ret;
4939
4940         case SIOCGIFMEM:
4941                 /* Get the per device memory space. We can add this but
4942                  * currently do not support it */
4943         case SIOCSIFMEM:
4944                 /* Set the per device memory buffer space.
4945                  * Not applicable in our case */
4946         case SIOCSIFLINK:
4947                 return -EINVAL;
4948
4949         /*
4950          *      Unknown or private ioctl.
4951          */
4952         default:
4953                 if (cmd == SIOCWANDEV ||
4954                     (cmd >= SIOCDEVPRIVATE &&
4955                      cmd <= SIOCDEVPRIVATE + 15)) {
4956                         dev_load(net, ifr.ifr_name);
4957                         rtnl_lock();
4958                         ret = dev_ifsioc(net, &ifr, cmd);
4959                         rtnl_unlock();
4960                         if (!ret && copy_to_user(arg, &ifr,
4961                                                  sizeof(struct ifreq)))
4962                                 ret = -EFAULT;
4963                         return ret;
4964                 }
4965                 /* Take care of Wireless Extensions */
4966                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4967                         return wext_handle_ioctl(net, &ifr, cmd, arg);
4968                 return -EINVAL;
4969         }
4970 }
4971
4972
4973 /**
4974  *      dev_new_index   -       allocate an ifindex
4975  *      @net: the applicable net namespace
4976  *
4977  *      Returns a suitable unique value for a new device interface
4978  *      number.  The caller must hold the rtnl semaphore or the
4979  *      dev_base_lock to be sure it remains unique.
4980  */
4981 static int dev_new_index(struct net *net)
4982 {
4983         static int ifindex;
4984         for (;;) {
4985                 if (++ifindex <= 0)
4986                         ifindex = 1;
4987                 if (!__dev_get_by_index(net, ifindex))
4988                         return ifindex;
4989         }
4990 }
4991
4992 /* Delayed registration/unregisteration */
4993 static LIST_HEAD(net_todo_list);
4994
4995 static void net_set_todo(struct net_device *dev)
4996 {
4997         list_add_tail(&dev->todo_list, &net_todo_list);
4998 }
4999
5000 static void rollback_registered_many(struct list_head *head)
5001 {
5002         struct net_device *dev, *tmp;
5003
5004         BUG_ON(dev_boot_phase);
5005         ASSERT_RTNL();
5006
5007         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5008                 /* Some devices call without registering
5009                  * for initialization unwind. Remove those
5010                  * devices and proceed with the remaining.
5011                  */
5012                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5013                         pr_debug("unregister_netdevice: device %s/%p never "
5014                                  "was registered\n", dev->name, dev);
5015
5016                         WARN_ON(1);
5017                         list_del(&dev->unreg_list);
5018                         continue;
5019                 }
5020
5021                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5022         }
5023
5024         /* If device is running, close it first. */
5025         dev_close_many(head);
5026
5027         list_for_each_entry(dev, head, unreg_list) {
5028                 /* And unlink it from device chain. */
5029                 unlist_netdevice(dev);
5030
5031                 dev->reg_state = NETREG_UNREGISTERING;
5032         }
5033
5034         synchronize_net();
5035
5036         list_for_each_entry(dev, head, unreg_list) {
5037                 /* Shutdown queueing discipline. */
5038                 dev_shutdown(dev);
5039
5040
5041                 /* Notify protocols, that we are about to destroy
5042                    this device. They should clean all the things.
5043                 */
5044                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5045
5046                 if (!dev->rtnl_link_ops ||
5047                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5048                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5049
5050                 /*
5051                  *      Flush the unicast and multicast chains
5052                  */
5053                 dev_uc_flush(dev);
5054                 dev_mc_flush(dev);
5055
5056                 if (dev->netdev_ops->ndo_uninit)
5057                         dev->netdev_ops->ndo_uninit(dev);
5058
5059                 /* Notifier chain MUST detach us from master device. */
5060                 WARN_ON(dev->master);
5061
5062                 /* Remove entries from kobject tree */
5063                 netdev_unregister_kobject(dev);
5064         }
5065
5066         /* Process any work delayed until the end of the batch */
5067         dev = list_first_entry(head, struct net_device, unreg_list);
5068         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5069
5070         rcu_barrier();
5071
5072         list_for_each_entry(dev, head, unreg_list)
5073                 dev_put(dev);
5074 }
5075
5076 static void rollback_registered(struct net_device *dev)
5077 {
5078         LIST_HEAD(single);
5079
5080         list_add(&dev->unreg_list, &single);
5081         rollback_registered_many(&single);
5082 }
5083
5084 unsigned long netdev_fix_features(unsigned long features, const char *name)
5085 {
5086         /* Fix illegal SG+CSUM combinations. */
5087         if ((features & NETIF_F_SG) &&
5088             !(features & NETIF_F_ALL_CSUM)) {
5089                 if (name)
5090                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
5091                                "checksum feature.\n", name);
5092                 features &= ~NETIF_F_SG;
5093         }
5094
5095         /* TSO requires that SG is present as well. */
5096         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5097                 if (name)
5098                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
5099                                "SG feature.\n", name);
5100                 features &= ~NETIF_F_TSO;
5101         }
5102
5103         if (features & NETIF_F_UFO) {
5104                 /* maybe split UFO into V4 and V6? */
5105                 if (!((features & NETIF_F_GEN_CSUM) ||
5106                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5107                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5108                         if (name)
5109                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5110                                        "since no checksum offload features.\n",
5111                                        name);
5112                         features &= ~NETIF_F_UFO;
5113                 }
5114
5115                 if (!(features & NETIF_F_SG)) {
5116                         if (name)
5117                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5118                                        "since no NETIF_F_SG feature.\n", name);
5119                         features &= ~NETIF_F_UFO;
5120                 }
5121         }
5122
5123         return features;
5124 }
5125 EXPORT_SYMBOL(netdev_fix_features);
5126
5127 /**
5128  *      netif_stacked_transfer_operstate -      transfer operstate
5129  *      @rootdev: the root or lower level device to transfer state from
5130  *      @dev: the device to transfer operstate to
5131  *
5132  *      Transfer operational state from root to device. This is normally
5133  *      called when a stacking relationship exists between the root
5134  *      device and the device(a leaf device).
5135  */
5136 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5137                                         struct net_device *dev)
5138 {
5139         if (rootdev->operstate == IF_OPER_DORMANT)
5140                 netif_dormant_on(dev);
5141         else
5142                 netif_dormant_off(dev);
5143
5144         if (netif_carrier_ok(rootdev)) {
5145                 if (!netif_carrier_ok(dev))
5146                         netif_carrier_on(dev);
5147         } else {
5148                 if (netif_carrier_ok(dev))
5149                         netif_carrier_off(dev);
5150         }
5151 }
5152 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5153
5154 #ifdef CONFIG_RPS
5155 static int netif_alloc_rx_queues(struct net_device *dev)
5156 {
5157         unsigned int i, count = dev->num_rx_queues;
5158         struct netdev_rx_queue *rx;
5159
5160         BUG_ON(count < 1);
5161
5162         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5163         if (!rx) {
5164                 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5165                 return -ENOMEM;
5166         }
5167         dev->_rx = rx;
5168
5169         for (i = 0; i < count; i++)
5170                 rx[i].dev = dev;
5171         return 0;
5172 }
5173 #endif
5174
5175 static void netdev_init_one_queue(struct net_device *dev,
5176                                   struct netdev_queue *queue, void *_unused)
5177 {
5178         /* Initialize queue lock */
5179         spin_lock_init(&queue->_xmit_lock);
5180         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5181         queue->xmit_lock_owner = -1;
5182         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5183         queue->dev = dev;
5184 }
5185
5186 static int netif_alloc_netdev_queues(struct net_device *dev)
5187 {
5188         unsigned int count = dev->num_tx_queues;
5189         struct netdev_queue *tx;
5190
5191         BUG_ON(count < 1);
5192
5193         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5194         if (!tx) {
5195                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5196                        count);
5197                 return -ENOMEM;
5198         }
5199         dev->_tx = tx;
5200
5201         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5202         spin_lock_init(&dev->tx_global_lock);
5203
5204         return 0;
5205 }
5206
5207 /**
5208  *      register_netdevice      - register a network device
5209  *      @dev: device to register
5210  *
5211  *      Take a completed network device structure and add it to the kernel
5212  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5213  *      chain. 0 is returned on success. A negative errno code is returned
5214  *      on a failure to set up the device, or if the name is a duplicate.
5215  *
5216  *      Callers must hold the rtnl semaphore. You may want
5217  *      register_netdev() instead of this.
5218  *
5219  *      BUGS:
5220  *      The locking appears insufficient to guarantee two parallel registers
5221  *      will not get the same name.
5222  */
5223
5224 int register_netdevice(struct net_device *dev)
5225 {
5226         int ret;
5227         struct net *net = dev_net(dev);
5228
5229         BUG_ON(dev_boot_phase);
5230         ASSERT_RTNL();
5231
5232         might_sleep();
5233
5234         /* When net_device's are persistent, this will be fatal. */
5235         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5236         BUG_ON(!net);
5237
5238         spin_lock_init(&dev->addr_list_lock);
5239         netdev_set_addr_lockdep_class(dev);
5240
5241         dev->iflink = -1;
5242
5243         /* Init, if this function is available */
5244         if (dev->netdev_ops->ndo_init) {
5245                 ret = dev->netdev_ops->ndo_init(dev);
5246                 if (ret) {
5247                         if (ret > 0)
5248                                 ret = -EIO;
5249                         goto out;
5250                 }
5251         }
5252
5253         ret = dev_get_valid_name(dev, dev->name, 0);
5254         if (ret)
5255                 goto err_uninit;
5256
5257         dev->ifindex = dev_new_index(net);
5258         if (dev->iflink == -1)
5259                 dev->iflink = dev->ifindex;
5260
5261         /* Fix illegal checksum combinations */
5262         if ((dev->features & NETIF_F_HW_CSUM) &&
5263             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5264                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5265                        dev->name);
5266                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5267         }
5268
5269         if ((dev->features & NETIF_F_NO_CSUM) &&
5270             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5271                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5272                        dev->name);
5273                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5274         }
5275
5276         dev->features = netdev_fix_features(dev->features, dev->name);
5277
5278         /* Enable software GSO if SG is supported. */
5279         if (dev->features & NETIF_F_SG)
5280                 dev->features |= NETIF_F_GSO;
5281
5282         /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5283          * vlan_dev_init() will do the dev->features check, so these features
5284          * are enabled only if supported by underlying device.
5285          */
5286         dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5287
5288         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5289         ret = notifier_to_errno(ret);
5290         if (ret)
5291                 goto err_uninit;
5292
5293         ret = netdev_register_kobject(dev);
5294         if (ret)
5295                 goto err_uninit;
5296         dev->reg_state = NETREG_REGISTERED;
5297
5298         /*
5299          *      Default initial state at registry is that the
5300          *      device is present.
5301          */
5302
5303         set_bit(__LINK_STATE_PRESENT, &dev->state);
5304
5305         dev_init_scheduler(dev);
5306         dev_hold(dev);
5307         list_netdevice(dev);
5308
5309         /* Notify protocols, that a new device appeared. */
5310         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5311         ret = notifier_to_errno(ret);
5312         if (ret) {
5313                 rollback_registered(dev);
5314                 dev->reg_state = NETREG_UNREGISTERED;
5315         }
5316         /*
5317          *      Prevent userspace races by waiting until the network
5318          *      device is fully setup before sending notifications.
5319          */
5320         if (!dev->rtnl_link_ops ||
5321             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5322                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5323
5324 out:
5325         return ret;
5326
5327 err_uninit:
5328         if (dev->netdev_ops->ndo_uninit)
5329                 dev->netdev_ops->ndo_uninit(dev);
5330         goto out;
5331 }
5332 EXPORT_SYMBOL(register_netdevice);
5333
5334 /**
5335  *      init_dummy_netdev       - init a dummy network device for NAPI
5336  *      @dev: device to init
5337  *
5338  *      This takes a network device structure and initialize the minimum
5339  *      amount of fields so it can be used to schedule NAPI polls without
5340  *      registering a full blown interface. This is to be used by drivers
5341  *      that need to tie several hardware interfaces to a single NAPI
5342  *      poll scheduler due to HW limitations.
5343  */
5344 int init_dummy_netdev(struct net_device *dev)
5345 {
5346         /* Clear everything. Note we don't initialize spinlocks
5347          * are they aren't supposed to be taken by any of the
5348          * NAPI code and this dummy netdev is supposed to be
5349          * only ever used for NAPI polls
5350          */
5351         memset(dev, 0, sizeof(struct net_device));
5352
5353         /* make sure we BUG if trying to hit standard
5354          * register/unregister code path
5355          */
5356         dev->reg_state = NETREG_DUMMY;
5357
5358         /* NAPI wants this */
5359         INIT_LIST_HEAD(&dev->napi_list);
5360
5361         /* a dummy interface is started by default */
5362         set_bit(__LINK_STATE_PRESENT, &dev->state);
5363         set_bit(__LINK_STATE_START, &dev->state);
5364
5365         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5366          * because users of this 'device' dont need to change
5367          * its refcount.
5368          */
5369
5370         return 0;
5371 }
5372 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5373
5374
5375 /**
5376  *      register_netdev - register a network device
5377  *      @dev: device to register
5378  *
5379  *      Take a completed network device structure and add it to the kernel
5380  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5381  *      chain. 0 is returned on success. A negative errno code is returned
5382  *      on a failure to set up the device, or if the name is a duplicate.
5383  *
5384  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5385  *      and expands the device name if you passed a format string to
5386  *      alloc_netdev.
5387  */
5388 int register_netdev(struct net_device *dev)
5389 {
5390         int err;
5391
5392         rtnl_lock();
5393
5394         /*
5395          * If the name is a format string the caller wants us to do a
5396          * name allocation.
5397          */
5398         if (strchr(dev->name, '%')) {
5399                 err = dev_alloc_name(dev, dev->name);
5400                 if (err < 0)
5401                         goto out;
5402         }
5403
5404         err = register_netdevice(dev);
5405 out:
5406         rtnl_unlock();
5407         return err;
5408 }
5409 EXPORT_SYMBOL(register_netdev);
5410
5411 int netdev_refcnt_read(const struct net_device *dev)
5412 {
5413         int i, refcnt = 0;
5414
5415         for_each_possible_cpu(i)
5416                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5417         return refcnt;
5418 }
5419 EXPORT_SYMBOL(netdev_refcnt_read);
5420
5421 /*
5422  * netdev_wait_allrefs - wait until all references are gone.
5423  *
5424  * This is called when unregistering network devices.
5425  *
5426  * Any protocol or device that holds a reference should register
5427  * for netdevice notification, and cleanup and put back the
5428  * reference if they receive an UNREGISTER event.
5429  * We can get stuck here if buggy protocols don't correctly
5430  * call dev_put.
5431  */
5432 static void netdev_wait_allrefs(struct net_device *dev)
5433 {
5434         unsigned long rebroadcast_time, warning_time;
5435         int refcnt;
5436
5437         linkwatch_forget_dev(dev);
5438
5439         rebroadcast_time = warning_time = jiffies;
5440         refcnt = netdev_refcnt_read(dev);
5441
5442         while (refcnt != 0) {
5443                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5444                         rtnl_lock();
5445
5446                         /* Rebroadcast unregister notification */
5447                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5448                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5449                          * should have already handle it the first time */
5450
5451                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5452                                      &dev->state)) {
5453                                 /* We must not have linkwatch events
5454                                  * pending on unregister. If this
5455                                  * happens, we simply run the queue
5456                                  * unscheduled, resulting in a noop
5457                                  * for this device.
5458                                  */
5459                                 linkwatch_run_queue();
5460                         }
5461
5462                         __rtnl_unlock();
5463
5464                         rebroadcast_time = jiffies;
5465                 }
5466
5467                 msleep(250);
5468
5469                 refcnt = netdev_refcnt_read(dev);
5470
5471                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5472                         printk(KERN_EMERG "unregister_netdevice: "
5473                                "waiting for %s to become free. Usage "
5474                                "count = %d\n",
5475                                dev->name, refcnt);
5476                         warning_time = jiffies;
5477                 }
5478         }
5479 }
5480
5481 /* The sequence is:
5482  *
5483  *      rtnl_lock();
5484  *      ...
5485  *      register_netdevice(x1);
5486  *      register_netdevice(x2);
5487  *      ...
5488  *      unregister_netdevice(y1);
5489  *      unregister_netdevice(y2);
5490  *      ...
5491  *      rtnl_unlock();
5492  *      free_netdev(y1);
5493  *      free_netdev(y2);
5494  *
5495  * We are invoked by rtnl_unlock().
5496  * This allows us to deal with problems:
5497  * 1) We can delete sysfs objects which invoke hotplug
5498  *    without deadlocking with linkwatch via keventd.
5499  * 2) Since we run with the RTNL semaphore not held, we can sleep
5500  *    safely in order to wait for the netdev refcnt to drop to zero.
5501  *
5502  * We must not return until all unregister events added during
5503  * the interval the lock was held have been completed.
5504  */
5505 void netdev_run_todo(void)
5506 {
5507         struct list_head list;
5508
5509         /* Snapshot list, allow later requests */
5510         list_replace_init(&net_todo_list, &list);
5511
5512         __rtnl_unlock();
5513
5514         while (!list_empty(&list)) {
5515                 struct net_device *dev
5516                         = list_first_entry(&list, struct net_device, todo_list);
5517                 list_del(&dev->todo_list);
5518
5519                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5520                         printk(KERN_ERR "network todo '%s' but state %d\n",
5521                                dev->name, dev->reg_state);
5522                         dump_stack();
5523                         continue;
5524                 }
5525
5526                 dev->reg_state = NETREG_UNREGISTERED;
5527
5528                 on_each_cpu(flush_backlog, dev, 1);
5529
5530                 netdev_wait_allrefs(dev);
5531
5532                 /* paranoia */
5533                 BUG_ON(netdev_refcnt_read(dev));
5534                 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5535                 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5536                 WARN_ON(dev->dn_ptr);
5537
5538                 if (dev->destructor)
5539                         dev->destructor(dev);
5540
5541                 /* Free network device */
5542                 kobject_put(&dev->dev.kobj);
5543         }
5544 }
5545
5546 /**
5547  *      dev_txq_stats_fold - fold tx_queues stats
5548  *      @dev: device to get statistics from
5549  *      @stats: struct rtnl_link_stats64 to hold results
5550  */
5551 void dev_txq_stats_fold(const struct net_device *dev,
5552                         struct rtnl_link_stats64 *stats)
5553 {
5554         u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5555         unsigned int i;
5556         struct netdev_queue *txq;
5557
5558         for (i = 0; i < dev->num_tx_queues; i++) {
5559                 txq = netdev_get_tx_queue(dev, i);
5560                 spin_lock_bh(&txq->_xmit_lock);
5561                 tx_bytes   += txq->tx_bytes;
5562                 tx_packets += txq->tx_packets;
5563                 tx_dropped += txq->tx_dropped;
5564                 spin_unlock_bh(&txq->_xmit_lock);
5565         }
5566         if (tx_bytes || tx_packets || tx_dropped) {
5567                 stats->tx_bytes   = tx_bytes;
5568                 stats->tx_packets = tx_packets;
5569                 stats->tx_dropped = tx_dropped;
5570         }
5571 }
5572 EXPORT_SYMBOL(dev_txq_stats_fold);
5573
5574 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5575  * fields in the same order, with only the type differing.
5576  */
5577 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5578                                     const struct net_device_stats *netdev_stats)
5579 {
5580 #if BITS_PER_LONG == 64
5581         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5582         memcpy(stats64, netdev_stats, sizeof(*stats64));
5583 #else
5584         size_t i, n = sizeof(*stats64) / sizeof(u64);
5585         const unsigned long *src = (const unsigned long *)netdev_stats;
5586         u64 *dst = (u64 *)stats64;
5587
5588         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5589                      sizeof(*stats64) / sizeof(u64));
5590         for (i = 0; i < n; i++)
5591                 dst[i] = src[i];
5592 #endif
5593 }
5594
5595 /**
5596  *      dev_get_stats   - get network device statistics
5597  *      @dev: device to get statistics from
5598  *      @storage: place to store stats
5599  *
5600  *      Get network statistics from device. Return @storage.
5601  *      The device driver may provide its own method by setting
5602  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5603  *      otherwise the internal statistics structure is used.
5604  */
5605 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5606                                         struct rtnl_link_stats64 *storage)
5607 {
5608         const struct net_device_ops *ops = dev->netdev_ops;
5609
5610         if (ops->ndo_get_stats64) {
5611                 memset(storage, 0, sizeof(*storage));
5612                 ops->ndo_get_stats64(dev, storage);
5613         } else if (ops->ndo_get_stats) {
5614                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5615         } else {
5616                 netdev_stats_to_stats64(storage, &dev->stats);
5617                 dev_txq_stats_fold(dev, storage);
5618         }
5619         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5620         return storage;
5621 }
5622 EXPORT_SYMBOL(dev_get_stats);
5623
5624 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5625 {
5626         struct netdev_queue *queue = dev_ingress_queue(dev);
5627
5628 #ifdef CONFIG_NET_CLS_ACT
5629         if (queue)
5630                 return queue;
5631         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5632         if (!queue)
5633                 return NULL;
5634         netdev_init_one_queue(dev, queue, NULL);
5635         queue->qdisc = &noop_qdisc;
5636         queue->qdisc_sleeping = &noop_qdisc;
5637         rcu_assign_pointer(dev->ingress_queue, queue);
5638 #endif
5639         return queue;
5640 }
5641
5642 /**
5643  *      alloc_netdev_mq - allocate network device
5644  *      @sizeof_priv:   size of private data to allocate space for
5645  *      @name:          device name format string
5646  *      @setup:         callback to initialize device
5647  *      @queue_count:   the number of subqueues to allocate
5648  *
5649  *      Allocates a struct net_device with private data area for driver use
5650  *      and performs basic initialization.  Also allocates subquue structs
5651  *      for each queue on the device at the end of the netdevice.
5652  */
5653 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5654                 void (*setup)(struct net_device *), unsigned int queue_count)
5655 {
5656         struct net_device *dev;
5657         size_t alloc_size;
5658         struct net_device *p;
5659
5660         BUG_ON(strlen(name) >= sizeof(dev->name));
5661
5662         if (queue_count < 1) {
5663                 pr_err("alloc_netdev: Unable to allocate device "
5664                        "with zero queues.\n");
5665                 return NULL;
5666         }
5667
5668         alloc_size = sizeof(struct net_device);
5669         if (sizeof_priv) {
5670                 /* ensure 32-byte alignment of private area */
5671                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5672                 alloc_size += sizeof_priv;
5673         }
5674         /* ensure 32-byte alignment of whole construct */
5675         alloc_size += NETDEV_ALIGN - 1;
5676
5677         p = kzalloc(alloc_size, GFP_KERNEL);
5678         if (!p) {
5679                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5680                 return NULL;
5681         }
5682
5683         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5684         dev->padded = (char *)dev - (char *)p;
5685
5686         dev->pcpu_refcnt = alloc_percpu(int);
5687         if (!dev->pcpu_refcnt)
5688                 goto free_p;
5689
5690         if (dev_addr_init(dev))
5691                 goto free_pcpu;
5692
5693         dev_mc_init(dev);
5694         dev_uc_init(dev);
5695
5696         dev_net_set(dev, &init_net);
5697
5698         dev->num_tx_queues = queue_count;
5699         dev->real_num_tx_queues = queue_count;
5700         if (netif_alloc_netdev_queues(dev))
5701                 goto free_pcpu;
5702
5703 #ifdef CONFIG_RPS
5704         dev->num_rx_queues = queue_count;
5705         dev->real_num_rx_queues = queue_count;
5706         if (netif_alloc_rx_queues(dev))
5707                 goto free_pcpu;
5708 #endif
5709
5710         dev->gso_max_size = GSO_MAX_SIZE;
5711
5712         INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5713         dev->ethtool_ntuple_list.count = 0;
5714         INIT_LIST_HEAD(&dev->napi_list);
5715         INIT_LIST_HEAD(&dev->unreg_list);
5716         INIT_LIST_HEAD(&dev->link_watch_list);
5717         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5718         setup(dev);
5719         strcpy(dev->name, name);
5720         return dev;
5721
5722 free_pcpu:
5723         free_percpu(dev->pcpu_refcnt);
5724         kfree(dev->_tx);
5725 #ifdef CONFIG_RPS
5726         kfree(dev->_rx);
5727 #endif
5728
5729 free_p:
5730         kfree(p);
5731         return NULL;
5732 }
5733 EXPORT_SYMBOL(alloc_netdev_mq);
5734
5735 /**
5736  *      free_netdev - free network device
5737  *      @dev: device
5738  *
5739  *      This function does the last stage of destroying an allocated device
5740  *      interface. The reference to the device object is released.
5741  *      If this is the last reference then it will be freed.
5742  */
5743 void free_netdev(struct net_device *dev)
5744 {
5745         struct napi_struct *p, *n;
5746
5747         release_net(dev_net(dev));
5748
5749         kfree(dev->_tx);
5750 #ifdef CONFIG_RPS
5751         kfree(dev->_rx);
5752 #endif
5753
5754         kfree(rcu_dereference_raw(dev->ingress_queue));
5755
5756         /* Flush device addresses */
5757         dev_addr_flush(dev);
5758
5759         /* Clear ethtool n-tuple list */
5760         ethtool_ntuple_flush(dev);
5761
5762         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5763                 netif_napi_del(p);
5764
5765         free_percpu(dev->pcpu_refcnt);
5766         dev->pcpu_refcnt = NULL;
5767
5768         /*  Compatibility with error handling in drivers */
5769         if (dev->reg_state == NETREG_UNINITIALIZED) {
5770                 kfree((char *)dev - dev->padded);
5771                 return;
5772         }
5773
5774         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5775         dev->reg_state = NETREG_RELEASED;
5776
5777         /* will free via device release */
5778         put_device(&dev->dev);
5779 }
5780 EXPORT_SYMBOL(free_netdev);
5781
5782 /**
5783  *      synchronize_net -  Synchronize with packet receive processing
5784  *
5785  *      Wait for packets currently being received to be done.
5786  *      Does not block later packets from starting.
5787  */
5788 void synchronize_net(void)
5789 {
5790         might_sleep();
5791         synchronize_rcu();
5792 }
5793 EXPORT_SYMBOL(synchronize_net);
5794
5795 /**
5796  *      unregister_netdevice_queue - remove device from the kernel
5797  *      @dev: device
5798  *      @head: list
5799  *
5800  *      This function shuts down a device interface and removes it
5801  *      from the kernel tables.
5802  *      If head not NULL, device is queued to be unregistered later.
5803  *
5804  *      Callers must hold the rtnl semaphore.  You may want
5805  *      unregister_netdev() instead of this.
5806  */
5807
5808 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5809 {
5810         ASSERT_RTNL();
5811
5812         if (head) {
5813                 list_move_tail(&dev->unreg_list, head);
5814         } else {
5815                 rollback_registered(dev);
5816                 /* Finish processing unregister after unlock */
5817                 net_set_todo(dev);
5818         }
5819 }
5820 EXPORT_SYMBOL(unregister_netdevice_queue);
5821
5822 /**
5823  *      unregister_netdevice_many - unregister many devices
5824  *      @head: list of devices
5825  */
5826 void unregister_netdevice_many(struct list_head *head)
5827 {
5828         struct net_device *dev;
5829
5830         if (!list_empty(head)) {
5831                 rollback_registered_many(head);
5832                 list_for_each_entry(dev, head, unreg_list)
5833                         net_set_todo(dev);
5834         }
5835 }
5836 EXPORT_SYMBOL(unregister_netdevice_many);
5837
5838 /**
5839  *      unregister_netdev - remove device from the kernel
5840  *      @dev: device
5841  *
5842  *      This function shuts down a device interface and removes it
5843  *      from the kernel tables.
5844  *
5845  *      This is just a wrapper for unregister_netdevice that takes
5846  *      the rtnl semaphore.  In general you want to use this and not
5847  *      unregister_netdevice.
5848  */
5849 void unregister_netdev(struct net_device *dev)
5850 {
5851         rtnl_lock();
5852         unregister_netdevice(dev);
5853         rtnl_unlock();
5854 }
5855 EXPORT_SYMBOL(unregister_netdev);
5856
5857 /**
5858  *      dev_change_net_namespace - move device to different nethost namespace
5859  *      @dev: device
5860  *      @net: network namespace
5861  *      @pat: If not NULL name pattern to try if the current device name
5862  *            is already taken in the destination network namespace.
5863  *
5864  *      This function shuts down a device interface and moves it
5865  *      to a new network namespace. On success 0 is returned, on
5866  *      a failure a netagive errno code is returned.
5867  *
5868  *      Callers must hold the rtnl semaphore.
5869  */
5870
5871 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5872 {
5873         int err;
5874
5875         ASSERT_RTNL();
5876
5877         /* Don't allow namespace local devices to be moved. */
5878         err = -EINVAL;
5879         if (dev->features & NETIF_F_NETNS_LOCAL)
5880                 goto out;
5881
5882         /* Ensure the device has been registrered */
5883         err = -EINVAL;
5884         if (dev->reg_state != NETREG_REGISTERED)
5885                 goto out;
5886
5887         /* Get out if there is nothing todo */
5888         err = 0;
5889         if (net_eq(dev_net(dev), net))
5890                 goto out;
5891
5892         /* Pick the destination device name, and ensure
5893          * we can use it in the destination network namespace.
5894          */
5895         err = -EEXIST;
5896         if (__dev_get_by_name(net, dev->name)) {
5897                 /* We get here if we can't use the current device name */
5898                 if (!pat)
5899                         goto out;
5900                 if (dev_get_valid_name(dev, pat, 1))
5901                         goto out;
5902         }
5903
5904         /*
5905          * And now a mini version of register_netdevice unregister_netdevice.
5906          */
5907
5908         /* If device is running close it first. */
5909         dev_close(dev);
5910
5911         /* And unlink it from device chain */
5912         err = -ENODEV;
5913         unlist_netdevice(dev);
5914
5915         synchronize_net();
5916
5917         /* Shutdown queueing discipline. */
5918         dev_shutdown(dev);
5919
5920         /* Notify protocols, that we are about to destroy
5921            this device. They should clean all the things.
5922
5923            Note that dev->reg_state stays at NETREG_REGISTERED.
5924            This is wanted because this way 8021q and macvlan know
5925            the device is just moving and can keep their slaves up.
5926         */
5927         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5928         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5929
5930         /*
5931          *      Flush the unicast and multicast chains
5932          */
5933         dev_uc_flush(dev);
5934         dev_mc_flush(dev);
5935
5936         /* Actually switch the network namespace */
5937         dev_net_set(dev, net);
5938
5939         /* If there is an ifindex conflict assign a new one */
5940         if (__dev_get_by_index(net, dev->ifindex)) {
5941                 int iflink = (dev->iflink == dev->ifindex);
5942                 dev->ifindex = dev_new_index(net);
5943                 if (iflink)
5944                         dev->iflink = dev->ifindex;
5945         }
5946
5947         /* Fixup kobjects */
5948         err = device_rename(&dev->dev, dev->name);
5949         WARN_ON(err);
5950
5951         /* Add the device back in the hashes */
5952         list_netdevice(dev);
5953
5954         /* Notify protocols, that a new device appeared. */
5955         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5956
5957         /*
5958          *      Prevent userspace races by waiting until the network
5959          *      device is fully setup before sending notifications.
5960          */
5961         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5962
5963         synchronize_net();
5964         err = 0;
5965 out:
5966         return err;
5967 }
5968 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5969
5970 static int dev_cpu_callback(struct notifier_block *nfb,
5971                             unsigned long action,
5972                             void *ocpu)
5973 {
5974         struct sk_buff **list_skb;
5975         struct sk_buff *skb;
5976         unsigned int cpu, oldcpu = (unsigned long)ocpu;
5977         struct softnet_data *sd, *oldsd;
5978
5979         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5980                 return NOTIFY_OK;
5981
5982         local_irq_disable();
5983         cpu = smp_processor_id();
5984         sd = &per_cpu(softnet_data, cpu);
5985         oldsd = &per_cpu(softnet_data, oldcpu);
5986
5987         /* Find end of our completion_queue. */
5988         list_skb = &sd->completion_queue;
5989         while (*list_skb)
5990                 list_skb = &(*list_skb)->next;
5991         /* Append completion queue from offline CPU. */
5992         *list_skb = oldsd->completion_queue;
5993         oldsd->completion_queue = NULL;
5994
5995         /* Append output queue from offline CPU. */
5996         if (oldsd->output_queue) {
5997                 *sd->output_queue_tailp = oldsd->output_queue;
5998                 sd->output_queue_tailp = oldsd->output_queue_tailp;
5999                 oldsd->output_queue = NULL;
6000                 oldsd->output_queue_tailp = &oldsd->output_queue;
6001         }
6002
6003         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6004         local_irq_enable();
6005
6006         /* Process offline CPU's input_pkt_queue */
6007         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6008                 netif_rx(skb);
6009                 input_queue_head_incr(oldsd);
6010         }
6011         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6012                 netif_rx(skb);
6013                 input_queue_head_incr(oldsd);
6014         }
6015
6016         return NOTIFY_OK;
6017 }
6018
6019
6020 /**
6021  *      netdev_increment_features - increment feature set by one
6022  *      @all: current feature set
6023  *      @one: new feature set
6024  *      @mask: mask feature set
6025  *
6026  *      Computes a new feature set after adding a device with feature set
6027  *      @one to the master device with current feature set @all.  Will not
6028  *      enable anything that is off in @mask. Returns the new feature set.
6029  */
6030 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
6031                                         unsigned long mask)
6032 {
6033         /* If device needs checksumming, downgrade to it. */
6034         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6035                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6036         else if (mask & NETIF_F_ALL_CSUM) {
6037                 /* If one device supports v4/v6 checksumming, set for all. */
6038                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6039                     !(all & NETIF_F_GEN_CSUM)) {
6040                         all &= ~NETIF_F_ALL_CSUM;
6041                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6042                 }
6043
6044                 /* If one device supports hw checksumming, set for all. */
6045                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6046                         all &= ~NETIF_F_ALL_CSUM;
6047                         all |= NETIF_F_HW_CSUM;
6048                 }
6049         }
6050
6051         one |= NETIF_F_ALL_CSUM;
6052
6053         one |= all & NETIF_F_ONE_FOR_ALL;
6054         all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6055         all |= one & mask & NETIF_F_ONE_FOR_ALL;
6056
6057         return all;
6058 }
6059 EXPORT_SYMBOL(netdev_increment_features);
6060
6061 static struct hlist_head *netdev_create_hash(void)
6062 {
6063         int i;
6064         struct hlist_head *hash;
6065
6066         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6067         if (hash != NULL)
6068                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6069                         INIT_HLIST_HEAD(&hash[i]);
6070
6071         return hash;
6072 }
6073
6074 /* Initialize per network namespace state */
6075 static int __net_init netdev_init(struct net *net)
6076 {
6077         INIT_LIST_HEAD(&net->dev_base_head);
6078
6079         net->dev_name_head = netdev_create_hash();
6080         if (net->dev_name_head == NULL)
6081                 goto err_name;
6082
6083         net->dev_index_head = netdev_create_hash();
6084         if (net->dev_index_head == NULL)
6085                 goto err_idx;
6086
6087         return 0;
6088
6089 err_idx:
6090         kfree(net->dev_name_head);
6091 err_name:
6092         return -ENOMEM;
6093 }
6094
6095 /**
6096  *      netdev_drivername - network driver for the device
6097  *      @dev: network device
6098  *      @buffer: buffer for resulting name
6099  *      @len: size of buffer
6100  *
6101  *      Determine network driver for device.
6102  */
6103 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6104 {
6105         const struct device_driver *driver;
6106         const struct device *parent;
6107
6108         if (len <= 0 || !buffer)
6109                 return buffer;
6110         buffer[0] = 0;
6111
6112         parent = dev->dev.parent;
6113
6114         if (!parent)
6115                 return buffer;
6116
6117         driver = parent->driver;
6118         if (driver && driver->name)
6119                 strlcpy(buffer, driver->name, len);
6120         return buffer;
6121 }
6122
6123 static int __netdev_printk(const char *level, const struct net_device *dev,
6124                            struct va_format *vaf)
6125 {
6126         int r;
6127
6128         if (dev && dev->dev.parent)
6129                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6130                                netdev_name(dev), vaf);
6131         else if (dev)
6132                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6133         else
6134                 r = printk("%s(NULL net_device): %pV", level, vaf);
6135
6136         return r;
6137 }
6138
6139 int netdev_printk(const char *level, const struct net_device *dev,
6140                   const char *format, ...)
6141 {
6142         struct va_format vaf;
6143         va_list args;
6144         int r;
6145
6146         va_start(args, format);
6147
6148         vaf.fmt = format;
6149         vaf.va = &args;
6150
6151         r = __netdev_printk(level, dev, &vaf);
6152         va_end(args);
6153
6154         return r;
6155 }
6156 EXPORT_SYMBOL(netdev_printk);
6157
6158 #define define_netdev_printk_level(func, level)                 \
6159 int func(const struct net_device *dev, const char *fmt, ...)    \
6160 {                                                               \
6161         int r;                                                  \
6162         struct va_format vaf;                                   \
6163         va_list args;                                           \
6164                                                                 \
6165         va_start(args, fmt);                                    \
6166                                                                 \
6167         vaf.fmt = fmt;                                          \
6168         vaf.va = &args;                                         \
6169                                                                 \
6170         r = __netdev_printk(level, dev, &vaf);                  \
6171         va_end(args);                                           \
6172                                                                 \
6173         return r;                                               \
6174 }                                                               \
6175 EXPORT_SYMBOL(func);
6176
6177 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6178 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6179 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6180 define_netdev_printk_level(netdev_err, KERN_ERR);
6181 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6182 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6183 define_netdev_printk_level(netdev_info, KERN_INFO);
6184
6185 static void __net_exit netdev_exit(struct net *net)
6186 {
6187         kfree(net->dev_name_head);
6188         kfree(net->dev_index_head);
6189 }
6190
6191 static struct pernet_operations __net_initdata netdev_net_ops = {
6192         .init = netdev_init,
6193         .exit = netdev_exit,
6194 };
6195
6196 static void __net_exit default_device_exit(struct net *net)
6197 {
6198         struct net_device *dev, *aux;
6199         /*
6200          * Push all migratable network devices back to the
6201          * initial network namespace
6202          */
6203         rtnl_lock();
6204         for_each_netdev_safe(net, dev, aux) {
6205                 int err;
6206                 char fb_name[IFNAMSIZ];
6207
6208                 /* Ignore unmoveable devices (i.e. loopback) */
6209                 if (dev->features & NETIF_F_NETNS_LOCAL)
6210                         continue;
6211
6212                 /* Leave virtual devices for the generic cleanup */
6213                 if (dev->rtnl_link_ops)
6214                         continue;
6215
6216                 /* Push remaing network devices to init_net */
6217                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6218                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6219                 if (err) {
6220                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6221                                 __func__, dev->name, err);
6222                         BUG();
6223                 }
6224         }
6225         rtnl_unlock();
6226 }
6227
6228 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6229 {
6230         /* At exit all network devices most be removed from a network
6231          * namespace.  Do this in the reverse order of registeration.
6232          * Do this across as many network namespaces as possible to
6233          * improve batching efficiency.
6234          */
6235         struct net_device *dev;
6236         struct net *net;
6237         LIST_HEAD(dev_kill_list);
6238
6239         rtnl_lock();
6240         list_for_each_entry(net, net_list, exit_list) {
6241                 for_each_netdev_reverse(net, dev) {
6242                         if (dev->rtnl_link_ops)
6243                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6244                         else
6245                                 unregister_netdevice_queue(dev, &dev_kill_list);
6246                 }
6247         }
6248         unregister_netdevice_many(&dev_kill_list);
6249         rtnl_unlock();
6250 }
6251
6252 static struct pernet_operations __net_initdata default_device_ops = {
6253         .exit = default_device_exit,
6254         .exit_batch = default_device_exit_batch,
6255 };
6256
6257 /*
6258  *      Initialize the DEV module. At boot time this walks the device list and
6259  *      unhooks any devices that fail to initialise (normally hardware not
6260  *      present) and leaves us with a valid list of present and active devices.
6261  *
6262  */
6263
6264 /*
6265  *       This is called single threaded during boot, so no need
6266  *       to take the rtnl semaphore.
6267  */
6268 static int __init net_dev_init(void)
6269 {
6270         int i, rc = -ENOMEM;
6271
6272         BUG_ON(!dev_boot_phase);
6273
6274         if (dev_proc_init())
6275                 goto out;
6276
6277         if (netdev_kobject_init())
6278                 goto out;
6279
6280         INIT_LIST_HEAD(&ptype_all);
6281         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6282                 INIT_LIST_HEAD(&ptype_base[i]);
6283
6284         if (register_pernet_subsys(&netdev_net_ops))
6285                 goto out;
6286
6287         /*
6288          *      Initialise the packet receive queues.
6289          */
6290
6291         for_each_possible_cpu(i) {
6292                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6293
6294                 memset(sd, 0, sizeof(*sd));
6295                 skb_queue_head_init(&sd->input_pkt_queue);
6296                 skb_queue_head_init(&sd->process_queue);
6297                 sd->completion_queue = NULL;
6298                 INIT_LIST_HEAD(&sd->poll_list);
6299                 sd->output_queue = NULL;
6300                 sd->output_queue_tailp = &sd->output_queue;
6301 #ifdef CONFIG_RPS
6302                 sd->csd.func = rps_trigger_softirq;
6303                 sd->csd.info = sd;
6304                 sd->csd.flags = 0;
6305                 sd->cpu = i;
6306 #endif
6307
6308                 sd->backlog.poll = process_backlog;
6309                 sd->backlog.weight = weight_p;
6310                 sd->backlog.gro_list = NULL;
6311                 sd->backlog.gro_count = 0;
6312         }
6313
6314         dev_boot_phase = 0;
6315
6316         /* The loopback device is special if any other network devices
6317          * is present in a network namespace the loopback device must
6318          * be present. Since we now dynamically allocate and free the
6319          * loopback device ensure this invariant is maintained by
6320          * keeping the loopback device as the first device on the
6321          * list of network devices.  Ensuring the loopback devices
6322          * is the first device that appears and the last network device
6323          * that disappears.
6324          */
6325         if (register_pernet_device(&loopback_net_ops))
6326                 goto out;
6327
6328         if (register_pernet_device(&default_device_ops))
6329                 goto out;
6330
6331         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6332         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6333
6334         hotcpu_notifier(dev_cpu_callback, 0);
6335         dst_init();
6336         dev_mcast_init();
6337         rc = 0;
6338 out:
6339         return rc;
6340 }
6341
6342 subsys_initcall(net_dev_init);
6343
6344 static int __init initialize_hashrnd(void)
6345 {
6346         get_random_bytes(&hashrnd, sizeof(hashrnd));
6347         return 0;
6348 }
6349
6350 late_initcall_sync(initialize_hashrnd);
6351