/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly,
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <asm/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>

#include <linux/filter.h>

#include <trace/events/sock.h>

#include <net/busy_poll.h>
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap in the
 * user namespace @user_ns when the socket was created, and if the current
 * process has it as well.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);
/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap in all
 * user namespaces when the socket was created, and if the current process
 * has it as well.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);
/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap over the
 * network namespace the socket is a member of when the socket was created,
 * and if the current process has it as well.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);
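
/*
 * Usage sketch (illustrative, not a caller from this file): protocol code
 * typically gates privileged socket options on these helpers, e.g.:
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 */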
#ifdef CONFIG_MEMCG_KMEM
int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
	struct proto *proto;
	int ret = 0;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry(proto, &proto_list, node) {
		if (proto->init_cgroup) {
			ret = proto->init_cgroup(memcg, ss);
			if (ret)
				goto out;
		}
	}

	mutex_unlock(&proto_list_mutex);
	return ret;
out:
	list_for_each_entry_continue_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(memcg);
	mutex_unlock(&proto_list_mutex);
	return ret;
}

void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
{
	struct proto *proto;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(memcg);
	mutex_unlock(&proto_list_mutex);
}
#endif
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

#if defined(CONFIG_MEMCG_KMEM)
struct static_key memcg_socket_limit_enabled;
EXPORT_SYMBOL(memcg_socket_limit_enabled);
#endif
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
};
/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms. This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
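
/*
 * Rough arithmetic (architecture-dependent, since SKB_TRUESIZE() folds in
 * the aligned struct sk_buff and skb_shared_info overhead): with
 * SKB_TRUESIZE(256) coming to several hundred bytes, the SK_WMEM_MAX and
 * SK_RMEM_MAX defaults land in the low hundreds of kilobytes for
 * _SK_MEM_PACKETS == 256 queued packets.
 */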
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);
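
/*
 * Worked example (assuming UIO_MAXIOV == 1024 and 8-byte longs, i.e. a
 * typical 64-bit build): the default optmem limit comes to
 * 8 * (2 * 1024 + 512) = 20480 bytes of option memory per socket.
 */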
struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_key_slow_inc(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);
void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_key_slow_dec(&memalloc_socks);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
	 * it has rmem allocations there is a risk that the user of the
	 * socket cannot make forward progress due to exceeding the rmem
	 * limits. By rights, sk_clear_memalloc() should only be called
	 * on sockets being torn down, but warn and reset the accounting if
	 * that assumption breaks.
	 */
	if (WARN_ON(sk->sk_forward_alloc))
		sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned long pflags = current->flags;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	current->flags |= PF_MEMALLOC;
	ret = sk->sk_backlog_rcv(sk, skb);
	tsk_restore_flags(current, pflags, PF_MEMALLOC);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
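
/*
 * Userspace sketch (standard sockets API, shown for illustration): the
 * struct timeval handed to SO_RCVTIMEO/SO_SNDTIMEO is what
 * sock_set_timeout() above converts to jiffies:
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */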
static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}
#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	int skb_len;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue. Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	/* we escape from rcu protected region, make sure we don't leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);
static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}
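
/*
 * Userspace sketch (illustrative): binding a socket to one interface,
 * which lands in sock_setbindtodevice() above and requires CAP_NET_RAW:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 */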
static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}
/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead. Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
		break;
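
		/*
		 * Illustration of the doubling above (example values,
		 * subject to SOCK_MIN_RCVBUF and sysctl_rmem_max clamping):
		 * setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &(int){4096}, ...)
		 * stores sk_rcvbuf == 8192, and a later getsockopt(SO_RCVBUF)
		 * reports that doubled value back to the application.
		 */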
	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP &&
		    sk->sk_type == SOCK_STREAM)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}
		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID))
			sk->sk_tskey = 0;
		sk->sk_tsflags = val;
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		sk->sk_max_pacing_rate = val;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
					 sk->sk_max_pacing_rate);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}
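
	/*
	 * Userspace sketch (illustrative, typically AF_UNIX sockets):
	 * reading the peer credentials filled in by cred_to_ucred() above:
	 *
	 *	struct ucred cr;
	 *	socklen_t len = sizeof(cr);
	 *	getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &len);
	 */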
	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;

	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		v.val = sk->sk_max_pacing_rate;
		break;

	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}
/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}
void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
	unsigned long nulls1, nulls2;

	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
	if (nulls1 > nulls2)
		swap(nulls1, nulls2);

	if (nulls1 != 0)
		memset((char *)sk, 0, nulls1);
	memset((char *)sk + nulls1 + sizeof(void *), 0,
	       nulls2 - nulls1 - sizeof(void *));
	memset((char *)sk + nulls2 + sizeof(void *), 0,
	       size - nulls2 - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}
static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
void sock_update_netprioidx(struct sock *sk)
{
	if (in_interrupt())
		return;

	sk->sk_cgrp_prioidx = task_netprioidx(current);
}
EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif
/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sock_net_set(sk, get_net(net));
		atomic_set(&sk->sk_wmem_alloc, 1);

		sock_update_classid(sk);
		sock_update_netprioidx(sk);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);
static void __sk_free(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       atomic_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}
void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
/*
 * The last sock_put should drop the reference to sk->sk_net. It has already
 * been dropped in sk_change_net. Taking a reference to a stopping namespace
 * is not an option.
 * Take a reference to a socket to remove it from the hash _alive_ and after
 * that destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
	if (sk == NULL || sk->sk_socket == NULL)
		return;

	sock_hold(sk);
	sock_release(sk->sk_socket);
	release_net(sock_net(sk));
	sock_net_set(sk, get_net(&init_net));
	sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);
static void sk_update_clone(const struct sock *sk, struct sock *newsk)
{
	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
		sock_update_memcg(newsk);
}
/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;
	bool is_charged = true;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
		skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

		spin_lock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = rcu_dereference_protected(newsk->sk_filter, 1);
		if (filter != NULL)
			/* though it's an empty new sock, the charging may fail
			 * if sysctl_optmem_max was changed between creation of
			 * original socket and cloning
			 */
			is_charged = sk_filter_charge(newsk, filter);

		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
			/* It is still raw copy of parent, so invalidate
			 * destructor and make plain sk_free() */
			newsk->sk_destruct = NULL;
			bh_unlock_sock(newsk);
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		sk_update_clone(sk, newsk);

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	__sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
		}
	}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
/*
 *	Simple resource managers for sockets.
 */

/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		atomic_sub(len - 1, &sk->sk_wmem_alloc);
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
void skb_orphan_partial(struct sk_buff *skb)
{
	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
	 * so we do not completely orphan skb, but transfer all
	 * accounted bytes but one, to avoid unexpected reorders.
	 */
	if (skb->destructor == sock_wfree
#ifdef CONFIG_INET
	    || skb->destructor == tcp_wfree
#endif
		) {
		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
		skb->truesize = 1;
	} else {
		skb_orphan(skb);
	}
}
EXPORT_SYMBOL(skb_orphan_partial);
/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);
void sock_edemux(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

#ifdef CONFIG_INET
	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_put(inet_twsk(sk));
	else
#endif
		sock_put(sk);
}
EXPORT_SYMBOL(sock_edemux);
kuid_t sock_i_uid(struct sock *sk)
{
	kuid_t uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock_bh(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);
/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);
/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned int)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * fails.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);
/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
EXPORT_SYMBOL(sock_kfree_s);
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 * I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}
/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb = NULL;
	unsigned long chunk;
	gfp_t gfp_mask;
	long timeo;
	int err;
	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
	struct page *page;
	int i;

	err = -EMSGSIZE;
	if (npages > MAX_SKB_FRAGS)
		goto failure;

	timeo = sock_sndtimeo(sk, noblock);
	while (!skb) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
			set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			err = -EAGAIN;
			if (!timeo)
				goto failure;
			if (signal_pending(current))
				goto interrupted;
			timeo = sock_wait_for_wmem(sk, timeo);
			continue;
		}

		err = -ENOBUFS;
		gfp_mask = sk->sk_allocation;
		if (gfp_mask & __GFP_WAIT)
			gfp_mask |= __GFP_REPEAT;

		skb = alloc_skb(header_len, gfp_mask);
		if (!skb)
			goto failure;

		skb->truesize += data_len;

		for (i = 0; npages > 0; i++) {
			int order = max_page_order;

			while (order) {
				if (npages >= 1 << order) {
					page = alloc_pages(sk->sk_allocation |
							   __GFP_COMP |
							   __GFP_NOWARN |
							   __GFP_NORETRY,
							   order);
					if (page)
						goto fill_page;
				}
				order--;
			}
			page = alloc_page(sk->sk_allocation);
			if (!page)
				goto failure;
fill_page:
			chunk = min_t(unsigned long, data_len,
				      PAGE_SIZE << order);
			skb_fill_page_desc(skb, i, page, 0, chunk);
			data_len -= chunk;
			npages -= 1 << order;
		}
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER	get_order(32768)

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @prio: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less than or equal to PAGE_SIZE.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
{
	int order;

	if (pfrag->page) {
		if (atomic_read(&pfrag->page->_count) == 1) {
			pfrag->offset = 0;
			return true;
		}
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		put_page(pfrag->page);
	}

	order = SKB_FRAG_PAGE_ORDER;
	do {
		gfp_t gfp = prio;

		if (order)
			gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
		pfrag->page = alloc_pages(gfp, order);
		if (likely(pfrag->page)) {
			pfrag->offset = 0;
			pfrag->size = PAGE_SIZE << order;
			return true;
		}
	} while (--order >= 0);

	return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
		return true;

	sk_enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
static void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}
static void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
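
/*
 * Usage sketch (illustrative of how protocols drive sk_wait_data(); not a
 * verbatim caller from this file): a blocking recvmsg() implementation
 * typically loops like
 *
 *	timeo = sock_rcvtimeo(sk, noblock);
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo);
 *	}
 */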
/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	long allocated;
	int parent_status = UNDER_LIMIT;

	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;

	allocated = sk_memory_allocated_add(sk, amt, &parent_status);

	/* Under limit. */
	if (parent_status == UNDER_LIMIT &&
			allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. (we or our parents) */
	if ((parent_status > SOFT_LIMIT) ||
			allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit (we or our parents) */
	if ((parent_status == OVER_LIMIT) ||
			(allocated > sk_prot_mem_limits(sk, 2)))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;

	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
				return 1;
	}

	if (sk_has_memory_pressure(sk)) {
		int alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	trace_sock_exceed_buf_limit(sk, prot, allocated);

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;

	sk_memory_allocated_sub(sk, amt);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
/**
 *	__sk_mem_reclaim - reclaim memory_allocated
 *	@sk: socket
 */
void __sk_mem_reclaim(struct sock *sk)
{
	sk_memory_allocated_sub(sk,
				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;

	if (sk_under_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}
EXPORT_SYMBOL(sock_no_poll);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, unsigned int optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
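
/*
 * Hedged example (hypothetical protocol, not defined in this file): a
 * struct proto_ops can point each unsupported operation at these stubs:
 *
 *	static const struct proto_ops example_proto_ops = {
 *		.family		= PF_UNSPEC,
 *		.socketpair	= sock_no_socketpair,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */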
/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, POLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}

static void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
						POLLRDNORM | POLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
						POLLWRNORM | POLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}
static void sock_def_destruct(struct sock *sk)
{
	kfree(sk->sk_protinfo);
}
void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head	=	NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		sk->sk_wq	=	sock->wq;
		sock->sk	=	sk;
	} else
		sk->sk_wq	=	NULL;

	spin_lock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid		=	NULL;
	sk->sk_peer_cred	=	NULL;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0U;
	sk->sk_pacing_rate = ~0U;
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);
void release_sock(struct sock *sk)
{
	/*
	 * The sk_lock has mutex_unlock() semantics:
	 */
	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Returns false if the fast path was taken:
 *   sk_lock.slock locked, owned = 0, BH disabled
 * Returns true if the slow path was taken:
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note : We must disable BH
		 */
		return false;

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);
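
/*
 * Typical usage (unlock_sock_fast() is defined in include/net/sock.h and
 * undoes whichever path was taken):
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short, non-blocking critical section ...
 *	unlock_sock_fast(sk, slow);
 */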
int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);
void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb, *skb2;
	int copied, err;

	err = -EAGAIN;
	skb = skb_dequeue(&sk->sk_error_queue);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

	/* Reset and regenerate socket error */
	spin_lock_bh(&sk->sk_error_queue.lock);
	sk->sk_err = 0;
	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
		spin_unlock_bh(&sk->sk_error_queue.lock);
		sk->sk_error_report(sk);
	} else
		spin_unlock_bh(&sk->sk_error_queue.lock);

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);
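
/*
 * Userspace sketch (illustrative): draining a socket's error queue, e.g.
 * for queued ICMP errors or tx timestamps, maps onto this helper:
 *
 *	struct msghdr msg = { 0 };
 *	... point msg.msg_iov and msg.msg_control at local buffers ...
 *	recvmsg(fd, &msg, MSG_ERRQUEUE);
 */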
/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);
#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif
int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);
/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);
#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
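
/*
 * Hedged wiring sketch: families with no special option handling point
 * their struct proto_ops at these common helpers. The ops name below is
 * hypothetical.
 */
#if 0
static const struct proto_ops example_ops = {
	/* ... */
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.recvmsg	   = sock_common_recvmsg,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
#endif
	/* ... */
};
#endif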
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the network stack still does.
	 *
	 * Step one, detach it from networking:
	 *
	 * A. Remove it from the hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did the hash table lookup before we unhashed
	 * the socket. They will reach the receive queue and will be purged
	 * by the socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and,
	 * probably, our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
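
/*
 * Hedged usage sketch: simple protocols call this as the tail of their
 * close() path once protocol-private teardown is done. The function
 * name below is hypothetical.
 */
#if 0
static void example_close(struct sock *sk, long timeout)
{
	/* protocol-private cleanup here */
	sk_common_release(sk);
}
#endif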
#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif
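
/*
 * Hedged usage note: protocols bump this per-cpu counter from their
 * hash()/unhash() callbacks so /proc/net/protocols can report the
 * number of sockets in use, roughly:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	on hash
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	on unhash
 */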
static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif
int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (prot->rsk_prot != NULL) {
			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
			if (prot->rsk_prot->slab_name == NULL)
				goto out_free_sock_slab;

			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL);

			if (prot->rsk_prot->slab == NULL) {
				pr_crit("%s: Can't create request sock SLAB cache!\n",
					prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  SLAB_HWCACHE_ALIGN |
							prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	mutex_unlock(&proto_list_mutex);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	if (prot->rsk_prot)
		kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
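
/*
 * Hedged registration sketch: a protocol module fills in a struct proto
 * and registers it at init time; alloc_slab = 1 requests a dedicated
 * kmem cache sized by obj_size. All names below are illustrative.
 */
#if 0
static struct proto example_proto = {
	.name	  = "EXAMPLE",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct example_sock),
};

static int __init example_init(void)
{
	return proto_register(&example_proto, 1);
}

static void __exit example_exit(void)
{
	proto_unregister(&example_proto);
}
#endif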
void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		kfree(prot->rsk_prot->slab_name);
		prot->rsk_prot->slab = NULL;
	}

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}
static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol", "size", "sockets", "memory", "press",
			   "maxhdr", "slab", "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}
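
/*
 * Illustrative shape of /proc/net/protocols output (numeric values
 * elided; each trailing column is proto_method_implemented() for the
 * callback named in the header line):
 *
 * protocol  size sockets  memory press maxhdr  slab module     cl co di ...
 * EXAMPLE    ...     ...     ...    NI    ...  yes  kernel      y  y  n ...
 */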
static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};
static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &proto_seq_ops,
			    sizeof(struct seq_net_private));
}
static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}
static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */