net/unix/af_unix.c

   1 /*
   2  * NET4:        Implementation of BSD Unix domain sockets.
   3  *
   4  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   5  *
   6  *              This program is free software; you can redistribute it and/or
   7  *              modify it under the terms of the GNU General Public License
   8  *              as published by the Free Software Foundation; either version
   9  *              2 of the License, or (at your option) any later version.
  10  *
  11  * Fixes:
  12  *              Linus Torvalds  :       Assorted bug cures.
  13  *              Niibe Yutaka    :       async I/O support.
  14  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  15  *              Alan Cox        :       Limit size of allocated blocks.
  16  *              Alan Cox        :       Fixed the stupid socketpair bug.
  17  *              Alan Cox        :       BSD compatibility fine tuning.
  18  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  19  *              Alan Cox        :       Sorted out a proper draft version of
  20  *                                      file descriptor passing hacked up from
  21  *                                      Mike Shaver's work.
  22  *              Marty Leisner   :       Fixes to fd passing
  23  *              Nick Nevin      :       recvmsg bugfix.
  24  *              Alan Cox        :       Started proper garbage collector
  25  *              Heiko EiBfeldt  :       Missing verify_area check
  26  *              Alan Cox        :       Started POSIXisms
  27  *              Andreas Schwab  :       Replace inode by dentry for proper
  28  *                                      reference counting
  29  *              Kirk Petersen   :       Made this a module
  30  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  31  *                                      Lots of bug fixes.
   32  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
   33  *                                      by above two patches.
   34  *           Andrea Arcangeli   :       If possible we block in connect(2)
   35  *                                      if the max backlog of the listen socket
   36  *                                      has been reached. This won't break
   37  *                                      old apps and it will avoid huge amount
   38  *                                      of socks hashed (this for unix_gc()
   39  *                                      performance reasons).
  40  *                                      Security fix that limits the max
  41  *                                      number of socks to 2*max_files and
  42  *                                      the number of skb queueable in the
  43  *                                      dgram receiver.
  44  *              Artur Skawina   :       Hash function optimizations
  45  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  46  *            Malcolm Beattie   :       Set peercred for socketpair
  47  *           Michal Ostrowski   :       Module initialization cleanup.
  48  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  49  *                                      the core infrastructure is doing that
  50  *                                      for all net proto families now (2.5.69+)
  51  *
  52  *
  53  * Known differences from reference BSD that was tested:
  54  *
  55  *      [TO FIX]
  56  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  57  *              other the moment one end closes.
  58  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  59  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  60  *      [NOT TO FIX]
  61  *      accept() returns a path name even if the connecting socket has closed
  62  *              in the meantime (BSD loses the path and gives up).
  63  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  64  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  65  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  66  *      BSD af_unix apparently has connect forgetting to block properly.
  67  *              (need to check this with the POSIX spec in detail)
  68  *
  69  * Differences from 2.0.0-11-... (ANK)
  70  *      Bug fixes and improvements.
  71  *              - client shutdown killed server socket.
  72  *              - removed all useless cli/sti pairs.
  73  *
  74  *      Semantic changes/extensions.
  75  *              - generic control message passing.
  76  *              - SCM_CREDENTIALS control message.
  77  *              - "Abstract" (not FS based) socket bindings.
  78  *                Abstract names are sequences of bytes (not zero terminated)
  79  *                started by 0, so that this name space does not intersect
  80  *                with BSD names.
  81  */
  82
  83 #include <linux/module.h>
  84 #include <linux/kernel.h>
  85 #include <linux/signal.h>
  86 #include <linux/sched.h>
  87 #include <linux/errno.h>
  88 #include <linux/string.h>
  89 #include <linux/stat.h>
  90 #include <linux/dcache.h>
  91 #include <linux/namei.h>
  92 #include <linux/socket.h>
  93 #include <linux/un.h>
  94 #include <linux/fcntl.h>
  95 #include <linux/termios.h>
  96 #include <linux/sockios.h>
  97 #include <linux/net.h>
  98 #include <linux/in.h>
  99 #include <linux/fs.h>
 100 #include <linux/slab.h>
 101 #include <asm/uaccess.h>
 102 #include <linux/skbuff.h>
 103 #include <linux/netdevice.h>
 104 #include <net/net_namespace.h>
 105 #include <net/sock.h>
 106 #include <net/tcp_states.h>
 107 #include <net/af_unix.h>
 108 #include <linux/proc_fs.h>
 109 #include <linux/seq_file.h>
 110 #include <net/scm.h>
 111 #include <linux/init.h>
 112 #include <linux/poll.h>
 113 #include <linux/rtnetlink.h>
 114 #include <linux/mount.h>
 115 #include <net/checksum.h>
 116 #include <linux/security.h>
 117
 118 struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
 119 EXPORT_SYMBOL_GPL(unix_socket_table);
 120 DEFINE_SPINLOCK(unix_table_lock);
 121 EXPORT_SYMBOL_GPL(unix_table_lock);
 122 static atomic_long_t unix_nr_socks;
 123
 124 #define unix_sockets_unbound    (&unix_socket_table[UNIX_HASH_SIZE])
 125
 126 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
 127
 128 #ifdef CONFIG_SECURITY_NETWORK
 129 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 130 {
 131         memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
 132 }
 133
 134 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 135 {
 136         scm->secid = *UNIXSID(skb);
 137 }
 138 #else
 139 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 140 { }
 141
 142 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 143 { }
 144 #endif /* CONFIG_SECURITY_NETWORK */
 145
 146 /*
 147  *  SMP locking strategy:
 148  *    hash table is protected with spinlock unix_table_lock
 149  *    each socket state is protected by separate spin lock.
 150  */
 151
 152 static inline unsigned int unix_hash_fold(__wsum n)
 153 {
 154         unsigned int hash = (__force unsigned int)n;
 155
 156         hash ^= hash>>16;
 157         hash ^= hash>>8;
 158         return hash&(UNIX_HASH_SIZE-1);
 159 }
 160
 161 #define unix_peer(sk) (unix_sk(sk)->peer)
 162
 163 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 164 {
 165         return unix_peer(osk) == sk;
 166 }
 167
 168 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 169 {
 170         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 171 }
 172
 173 static inline int unix_recvq_full(struct sock const *sk)
 174 {
 175         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 176 }
 177
 178 struct sock *unix_peer_get(struct sock *s)
 179 {
 180         struct sock *peer;
 181
 182         unix_state_lock(s);
 183         peer = unix_peer(s);
 184         if (peer)
 185                 sock_hold(peer);
 186         unix_state_unlock(s);
 187         return peer;
 188 }
 189 EXPORT_SYMBOL_GPL(unix_peer_get);
 190
 191 static inline void unix_release_addr(struct unix_address *addr)
 192 {
 193         if (atomic_dec_and_test(&addr->refcnt))
 194                 kfree(addr);
 195 }
 196
 197 /*
 198  *      Check unix socket name:
 199  *              - should be not zero length.
 200  *              - if started by not zero, should be NULL terminated (FS object)
 201  *              - if started by zero, it is abstract name.
 202  */
 203
 204 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 205 {
 206         if (len <= sizeof(short) || len > sizeof(*sunaddr))
 207                 return -EINVAL;
 208         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 209                 return -EINVAL;
 210         if (sunaddr->sun_path[0]) {
 211                 /*
 212                  * This may look like an off by one error but it is a bit more
 213                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
 214                  * sun_path[108] doesn't as such exist.  However in kernel space
 215                  * we are guaranteed that it is a valid memory location in our
 216                  * kernel address buffer.
 217                  */
 218                 ((char *)sunaddr)[len] = 0;
 219                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
 220                 return len;
 221         }
 222
 223         *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 224         return len;
 225 }
 226
 227 static void __unix_remove_socket(struct sock *sk)
 228 {
 229         sk_del_node_init(sk);
 230 }
 231
 232 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 233 {
 234         WARN_ON(!sk_unhashed(sk));
 235         sk_add_node(sk, list);
 236 }
 237
 238 static inline void unix_remove_socket(struct sock *sk)
 239 {
 240         spin_lock(&unix_table_lock);
 241         __unix_remove_socket(sk);
 242         spin_unlock(&unix_table_lock);
 243 }
 244
 245 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 246 {
 247         spin_lock(&unix_table_lock);
 248         __unix_insert_socket(list, sk);
 249         spin_unlock(&unix_table_lock);
 250 }
 251
 252 static struct sock *__unix_find_socket_byname(struct net *net,
 253                                               struct sockaddr_un *sunname,
 254                                               int len, int type, unsigned int hash)
 255 {
 256         struct sock *s;
 257         struct hlist_node *node;
 258
 259         sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
 260                 struct unix_sock *u = unix_sk(s);
 261
 262                 if (!net_eq(sock_net(s), net))
 263                         continue;
 264
 265                 if (u->addr->len == len &&
 266                     !memcmp(u->addr->name, sunname, len))
 267                         goto found;
 268         }
 269         s = NULL;
 270 found:
 271         return s;
 272 }
 273
 274 static inline struct sock *unix_find_socket_byname(struct net *net,
 275                                                    struct sockaddr_un *sunname,
 276                                                    int len, int type,
 277                                                    unsigned int hash)
 278 {
 279         struct sock *s;
 280
 281         spin_lock(&unix_table_lock);
 282         s = __unix_find_socket_byname(net, sunname, len, type, hash);
 283         if (s)
 284                 sock_hold(s);
 285         spin_unlock(&unix_table_lock);
 286         return s;
 287 }
 288
 289 static struct sock *unix_find_socket_byinode(struct inode *i)
 290 {
 291         struct sock *s;
 292         struct hlist_node *node;
 293
 294         spin_lock(&unix_table_lock);
 295         sk_for_each(s, node,
 296                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 297                 struct dentry *dentry = unix_sk(s)->path.dentry;
 298
 299                 if (dentry && dentry->d_inode == i) {
 300                         sock_hold(s);
 301                         goto found;
 302                 }
 303         }
 304         s = NULL;
 305 found:
 306         spin_unlock(&unix_table_lock);
 307         return s;
 308 }
 309
 310 static inline int unix_writable(struct sock *sk)
 311 {
 312         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 313 }
 314
 315 static void unix_write_space(struct sock *sk)
 316 {
 317         struct socket_wq *wq;
 318
 319         rcu_read_lock();
 320         if (unix_writable(sk)) {
 321                 wq = rcu_dereference(sk->sk_wq);
 322                 if (wq_has_sleeper(wq))
 323                         wake_up_interruptible_sync_poll(&wq->wait,
 324                                 POLLOUT | POLLWRNORM | POLLWRBAND);
 325                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 326         }
 327         rcu_read_unlock();
 328 }
 329
 330 /* When dgram socket disconnects (or changes its peer), we clear its receive
 331  * queue of packets arrived from previous peer. First, it allows to do
 332  * flow control based only on wmem_alloc; second, sk connected to peer
 333  * may receive messages only from that peer. */
 334 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 335 {
 336         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 337                 skb_queue_purge(&sk->sk_receive_queue);
 338                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 339
 340                 /* If one link of bidirectional dgram pipe is disconnected,
 341                  * we signal error. Messages are lost. Do not make this,
 342                  * when peer was not connected to us.
 343                  */
 344                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 345                         other->sk_err = ECONNRESET;
 346                         other->sk_error_report(other);
 347                 }
 348         }
 349 }
 350
 351 static void unix_sock_destructor(struct sock *sk)
 352 {
 353         struct unix_sock *u = unix_sk(sk);
 354
 355         skb_queue_purge(&sk->sk_receive_queue);
 356
 357         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
 358         WARN_ON(!sk_unhashed(sk));
 359         WARN_ON(sk->sk_socket);
 360         if (!sock_flag(sk, SOCK_DEAD)) {
 361                 printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
 362                 return;
 363         }
 364
 365         if (u->addr)
 366                 unix_release_addr(u->addr);
 367
 368         atomic_long_dec(&unix_nr_socks);
 369         local_bh_disable();
 370         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 371         local_bh_enable();
 372 #ifdef UNIX_REFCNT_DEBUG
 373         printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
 374                 atomic_long_read(&unix_nr_socks));
 375 #endif
 376 }
 377
 378 static int unix_release_sock(struct sock *sk, int embrion)
 379 {
 380         struct unix_sock *u = unix_sk(sk);
 381         struct path path;
 382         struct sock *skpair;
 383         struct sk_buff *skb;
 384         int state;
 385
 386         unix_remove_socket(sk);
 387
 388         /* Clear state */
 389         unix_state_lock(sk);
 390         sock_orphan(sk);
 391         sk->sk_shutdown = SHUTDOWN_MASK;
 392         path         = u->path;
 393         u->path.dentry = NULL;
 394         u->path.mnt = NULL;
 395         state = sk->sk_state;
 396         sk->sk_state = TCP_CLOSE;
 397         unix_state_unlock(sk);
 398
 399         wake_up_interruptible_all(&u->peer_wait);
 400
 401         skpair = unix_peer(sk);
 402
 403         if (skpair != NULL) {
 404                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 405                         unix_state_lock(skpair);
 406                         /* No more writes */
 407                         skpair->sk_shutdown = SHUTDOWN_MASK;
 408                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 409                                 skpair->sk_err = ECONNRESET;
 410                         unix_state_unlock(skpair);
 411                         skpair->sk_state_change(skpair);
 412                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 413                 }
 414                 sock_put(skpair); /* It may now die */
 415                 unix_peer(sk) = NULL;
 416         }
 417
 418         /* Try to flush out this socket. Throw out buffers at least */
 419
 420         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 421                 if (state == TCP_LISTEN)
 422                         unix_release_sock(skb->sk, 1);
 423                 /* passed fds are erased in the kfree_skb hook        */
 424                 kfree_skb(skb);
 425         }
 426
 427         if (path.dentry)
 428                 path_put(&path);
 429
 430         sock_put(sk);
 431
 432         /* ---- Socket is dead now and most probably destroyed ---- */
 433
 434         /*
 435          * Fixme: BSD difference: In BSD all sockets connected to use get
 436          *        ECONNRESET and we die on the spot. In Linux we behave
 437          *        like files and pipes do and wait for the last
 438          *        dereference.
 439          *
 440          * Can't we simply set sock->err?
 441          *
 442          *        What the above comment does talk about? --ANK(980817)
 443          */
 444
 445         if (unix_tot_inflight)
 446                 unix_gc();              /* Garbage collect fds */
 447
 448         return 0;
 449 }
 450
 451 static void init_peercred(struct sock *sk)
 452 {
 453         put_pid(sk->sk_peer_pid);
 454         if (sk->sk_peer_cred)
 455                 put_cred(sk->sk_peer_cred);
 456         sk->sk_peer_pid  = get_pid(task_tgid(current));
 457         sk->sk_peer_cred = get_current_cred();
 458 }
 459
 460 static void copy_peercred(struct sock *sk, struct sock *peersk)
 461 {
 462         put_pid(sk->sk_peer_pid);
 463         if (sk->sk_peer_cred)
 464                 put_cred(sk->sk_peer_cred);
 465         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 466         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 467 }
 468
 469 static int unix_listen(struct socket *sock, int backlog)
 470 {
 471         int err;
 472         struct sock *sk = sock->sk;
 473         struct unix_sock *u = unix_sk(sk);
 474         struct pid *old_pid = NULL;
 475         const struct cred *old_cred = NULL;
 476
 477         err = -EOPNOTSUPP;
 478         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 479                 goto out;       /* Only stream/seqpacket sockets accept */
 480         err = -EINVAL;
 481         if (!u->addr)
 482                 goto out;       /* No listens on an unbound socket */
 483         unix_state_lock(sk);
 484         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 485                 goto out_unlock;
 486         if (backlog > sk->sk_max_ack_backlog)
 487                 wake_up_interruptible_all(&u->peer_wait);
 488         sk->sk_max_ack_backlog  = backlog;
 489         sk->sk_state            = TCP_LISTEN;
 490         /* set credentials so connect can copy them */
 491         init_peercred(sk);
 492         err = 0;
 493
 494 out_unlock:
 495         unix_state_unlock(sk);
 496         put_pid(old_pid);
 497         if (old_cred)
 498                 put_cred(old_cred);
 499 out:
 500         return err;
 501 }
 502
 503 static int unix_release(struct socket *);
 504 static int unix_bind(struct socket *, struct sockaddr *, int);
 505 static int unix_stream_connect(struct socket *, struct sockaddr *,
 506                                int addr_len, int flags);
 507 static int unix_socketpair(struct socket *, struct socket *);
 508 static int unix_accept(struct socket *, struct socket *, int);
 509 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
 510 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
 511 static unsigned int unix_dgram_poll(struct file *, struct socket *,
 512                                     poll_table *);
 513 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 514 static int unix_shutdown(struct socket *, int);
 515 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
 516                                struct msghdr *, size_t);
 517 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
 518                                struct msghdr *, size_t, int);
 519 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
 520                               struct msghdr *, size_t);
 521 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
 522                               struct msghdr *, size_t, int);
 523 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 524                               int, int);
 525 static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
 526                                   struct msghdr *, size_t);
 527 static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
 528                                   struct msghdr *, size_t, int);
 529
 530 static void unix_set_peek_off(struct sock *sk, int val)
 531 {
 532         struct unix_sock *u = unix_sk(sk);
 533
 534         mutex_lock(&u->readlock);
 535         sk->sk_peek_off = val;
 536         mutex_unlock(&u->readlock);
 537 }
 538
 539
 540 static const struct proto_ops unix_stream_ops = {
 541         .family =       PF_UNIX,
 542         .owner =        THIS_MODULE,
 543         .release =      unix_release,
 544         .bind =         unix_bind,
 545         .connect =      unix_stream_connect,
 546         .socketpair =   unix_socketpair,
 547         .accept =       unix_accept,
 548         .getname =      unix_getname,
 549         .poll =         unix_poll,
 550         .ioctl =        unix_ioctl,
 551         .listen =       unix_listen,
 552         .shutdown =     unix_shutdown,
 553         .setsockopt =   sock_no_setsockopt,
 554         .getsockopt =   sock_no_getsockopt,
 555         .sendmsg =      unix_stream_sendmsg,
 556         .recvmsg =      unix_stream_recvmsg,
 557         .mmap =         sock_no_mmap,
 558         .sendpage =     sock_no_sendpage,
 559         .set_peek_off = unix_set_peek_off,
 560 };
 561
 562 static const struct proto_ops unix_dgram_ops = {
 563         .family =       PF_UNIX,
 564         .owner =        THIS_MODULE,
 565         .release =      unix_release,
 566         .bind =         unix_bind,
 567         .connect =      unix_dgram_connect,
 568         .socketpair =   unix_socketpair,
 569         .accept =       sock_no_accept,
 570         .getname =      unix_getname,
 571         .poll =         unix_dgram_poll,
 572         .ioctl =        unix_ioctl,
 573         .listen =       sock_no_listen,
 574         .shutdown =     unix_shutdown,
 575         .setsockopt =   sock_no_setsockopt,
 576         .getsockopt =   sock_no_getsockopt,
 577         .sendmsg =      unix_dgram_sendmsg,
 578         .recvmsg =      unix_dgram_recvmsg,
 579         .mmap =         sock_no_mmap,
 580         .sendpage =     sock_no_sendpage,
 581         .set_peek_off = unix_set_peek_off,
 582 };
 583
 584 static const struct proto_ops unix_seqpacket_ops = {
 585         .family =       PF_UNIX,
 586         .owner =        THIS_MODULE,
 587         .release =      unix_release,
 588         .bind =         unix_bind,
 589         .connect =      unix_stream_connect,
 590         .socketpair =   unix_socketpair,
 591         .accept =       unix_accept,
 592         .getname =      unix_getname,
 593         .poll =         unix_dgram_poll,
 594         .ioctl =        unix_ioctl,
 595         .listen =       unix_listen,
 596         .shutdown =     unix_shutdown,
 597         .setsockopt =   sock_no_setsockopt,
 598         .getsockopt =   sock_no_getsockopt,
 599         .sendmsg =      unix_seqpacket_sendmsg,
 600         .recvmsg =      unix_seqpacket_recvmsg,
 601         .mmap =         sock_no_mmap,
 602         .sendpage =     sock_no_sendpage,
 603         .set_peek_off = unix_set_peek_off,
 604 };
 605
 606 static struct proto unix_proto = {
 607         .name                   = "UNIX",
 608         .owner                  = THIS_MODULE,
 609         .obj_size               = sizeof(struct unix_sock),
 610 };
 611
 612 /*
 613  * AF_UNIX sockets do not interact with hardware, hence they
 614  * dont trigger interrupts - so it's safe for them to have
 615  * bh-unsafe locking for their sk_receive_queue.lock. Split off
 616  * this special lock-class by reinitializing the spinlock key:
 617  */
 618 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 619
 620 static struct sock *unix_create1(struct net *net, struct socket *sock)
 621 {
 622         struct sock *sk = NULL;
 623         struct unix_sock *u;
 624
 625         atomic_long_inc(&unix_nr_socks);
 626         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 627                 goto out;
 628
 629         sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
 630         if (!sk)
 631                 goto out;
 632
 633         sock_init_data(sock, sk);
 634         lockdep_set_class(&sk->sk_receive_queue.lock,
 635                                 &af_unix_sk_receive_queue_lock_key);
 636
 637         sk->sk_write_space      = unix_write_space;
 638         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 639         sk->sk_destruct         = unix_sock_destructor;
 640         u         = unix_sk(sk);
 641         u->path.dentry = NULL;
 642         u->path.mnt = NULL;
 643         spin_lock_init(&u->lock);
 644         atomic_long_set(&u->inflight, 0);
 645         INIT_LIST_HEAD(&u->link);
 646         mutex_init(&u->readlock); /* single task reading lock */
 647         init_waitqueue_head(&u->peer_wait);
 648         unix_insert_socket(unix_sockets_unbound, sk);
 649 out:
 650         if (sk == NULL)
 651                 atomic_long_dec(&unix_nr_socks);
 652         else {
 653                 local_bh_disable();
 654                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 655                 local_bh_enable();
 656         }
 657         return sk;
 658 }
 659
 660 static int unix_create(struct net *net, struct socket *sock, int protocol,
 661                        int kern)
 662 {
 663         if (protocol && protocol != PF_UNIX)
 664                 return -EPROTONOSUPPORT;
 665
 666         sock->state = SS_UNCONNECTED;
 667
 668         switch (sock->type) {
 669         case SOCK_STREAM:
 670                 sock->ops = &unix_stream_ops;
 671                 break;
 672                 /*
 673                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 674                  *      nothing uses it.
 675                  */
 676         case SOCK_RAW:
 677                 sock->type = SOCK_DGRAM;
 678         case SOCK_DGRAM:
 679                 sock->ops = &unix_dgram_ops;
 680                 break;
 681         case SOCK_SEQPACKET:
 682                 sock->ops = &unix_seqpacket_ops;
 683                 break;
 684         default:
 685                 return -ESOCKTNOSUPPORT;
 686         }
 687
 688         return unix_create1(net, sock) ? 0 : -ENOMEM;
 689 }
 690
 691 static int unix_release(struct socket *sock)
 692 {
 693         struct sock *sk = sock->sk;
 694
 695         if (!sk)
 696                 return 0;
 697
 698         sock->sk = NULL;
 699
 700         return unix_release_sock(sk, 0);
 701 }
 702
 703 static int unix_autobind(struct socket *sock)
 704 {
 705         struct sock *sk = sock->sk;
 706         struct net *net = sock_net(sk);
 707         struct unix_sock *u = unix_sk(sk);
 708         static u32 ordernum = 1;
 709         struct unix_address *addr;
 710         int err;
 711         unsigned int retries = 0;
 712
 713         mutex_lock(&u->readlock);
 714
 715         err = 0;
 716         if (u->addr)
 717                 goto out;
 718
 719         err = -ENOMEM;
 720         addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
 721         if (!addr)
 722                 goto out;
 723
 724         addr->name->sun_family = AF_UNIX;
 725         atomic_set(&addr->refcnt, 1);
 726
 727 retry:
 728         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 729         addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 730
 731         spin_lock(&unix_table_lock);
 732         ordernum = (ordernum+1)&0xFFFFF;
 733
 734         if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 735                                       addr->hash)) {
 736                 spin_unlock(&unix_table_lock);
 737                 /*
 738                  * __unix_find_socket_byname() may take long time if many names
 739                  * are already in use.
 740                  */
 741                 cond_resched();
 742                 /* Give up if all names seems to be in use. */
 743                 if (retries++ == 0xFFFFF) {
 744                         err = -ENOSPC;
 745                         kfree(addr);
 746                         goto out;
 747                 }
 748                 goto retry;
 749         }
 750         addr->hash ^= sk->sk_type;
 751
 752         __unix_remove_socket(sk);
 753         u->addr = addr;
 754         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
 755         spin_unlock(&unix_table_lock);
 756         err = 0;
 757
 758 out:    mutex_unlock(&u->readlock);
 759         return err;
 760 }
 761
 /*
  * unix_find_other - find the socket named by a sockaddr_un.
  *
  * @sunname: the address to resolve.
  * @type:    socket type the caller expects the target to have.
  * @error:   receives a negative errno when NULL is returned.
  * @net, @len, @hash: only passed on to unix_find_socket_byname() for
  *           names whose first path byte is zero.
  *
  * A name with a non-zero first path byte is resolved through the
  * filesystem: the path is looked up with kern_path(), MAY_WRITE
  * permission on the inode is required, the inode must be a socket, and
  * the socket is then found by inode.  Any other name is looked up by
  * name.
  *
  * Returns the socket, or NULL with *error set to the lookup/permission
  * error, -ECONNREFUSED (no socket found) or -EPROTOTYPE (type mismatch).
  * The type-mismatch path drops a reference with sock_put(), and the
  * callers in this file call sock_put() on the result, so the returned
  * socket presumably carries a reference owned by the caller.
  */
 762 static struct sock *unix_find_other(struct net *net,
 763                                     struct sockaddr_un *sunname, int len,
 764                                     int type, unsigned int hash, int *error)
 765 {
 766         struct sock *u;
 767         struct path path;
 768         int err = 0;
 769
 770         if (sunname->sun_path[0]) {
 771                 struct inode *inode;
 772                 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 773                 if (err)
 774                         goto fail;
 775                 inode = path.dentry->d_inode;
 776                 err = inode_permission(inode, MAY_WRITE);
 777                 if (err)
 778                         goto put_fail;
 779
 780                 err = -ECONNREFUSED;
 781                 if (!S_ISSOCK(inode->i_mode))
 782                         goto put_fail;
 783                 u = unix_find_socket_byinode(inode);
 784                 if (!u)
 785                         goto put_fail;
 786
                     /* Touch the access time only when the lookup will succeed. */
 787                 if (u->sk_type == type)
 788                         touch_atime(&path);
 789
                     /* The path is not needed any more on either outcome below. */
 790                 path_put(&path);
 791
 792                 err = -EPROTOTYPE;
 793                 if (u->sk_type != type) {
 794                         sock_put(u);
 795                         goto fail;
 796                 }
 797         } else {
 798                 err = -ECONNREFUSED;
 799                 u = unix_find_socket_byname(net, sunname, len, type, hash);
 800                 if (u) {
 801                         struct dentry *dentry;
 802                         dentry = unix_sk(u)->path.dentry;
 803                         if (dentry)
 804                                 touch_atime(&unix_sk(u)->path);
 805                 } else
 806                         goto fail;
 807         }
 808         return u;
 809
 810 put_fail:
 811         path_put(&path);
 812 fail:
 813         *error = err;
 814         return NULL;
 815 }
 816
 817
 /*
  * unix_bind - bind an AF_UNIX socket to an address.
  *
  * An address of exactly sizeof(short) bytes is handed to unix_autobind().
  * Otherwise the name is validated with unix_mkname() and, with
  * u->readlock held, copied into a newly allocated unix_address.
  *
  * For a name whose first path byte is non-zero a socket node is created
  * in the filesystem with vfs_mknod() and the socket is inserted into the
  * table slot selected by the new inode's number.  For any other name the
  * table is first searched with __unix_find_socket_byname() and the socket
  * goes into the slot selected by addr->hash.
  *
  * Returns 0 on success or a negative errno: -EINVAL (family is not
  * AF_UNIX, or the socket already has an address), -ENOMEM, -EADDRINUSE
  * (name already present; -EEXIST from node creation is mapped to it), or
  * whatever unix_mkname(), path creation, the security hook or mknod
  * returned.
  */
 818 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 819 {
 820         struct sock *sk = sock->sk;
 821         struct net *net = sock_net(sk);
 822         struct unix_sock *u = unix_sk(sk);
 823         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 824         char *sun_path = sunaddr->sun_path;
 825         struct dentry *dentry = NULL;
 826         struct path path;
 827         int err;
 828         unsigned int hash;
 829         struct unix_address *addr;
 830         struct hlist_head *list;
 831
 832         err = -EINVAL;
 833         if (sunaddr->sun_family != AF_UNIX)
 834                 goto out;
 835
             /* Only the family was supplied: pick a name automatically. */
 836         if (addr_len == sizeof(short)) {
 837                 err = unix_autobind(sock);
 838                 goto out;
 839         }
 840
             /* On success unix_mkname() returns the length used from here on. */
 841         err = unix_mkname(sunaddr, addr_len, &hash);
 842         if (err < 0)
 843                 goto out;
 844         addr_len = err;
 845
 846         mutex_lock(&u->readlock);
 847
             /* A socket that already has an address cannot be bound again. */
 848         err = -EINVAL;
 849         if (u->addr)
 850                 goto out_up;
 851
 852         err = -ENOMEM;
 853         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
 854         if (!addr)
 855                 goto out_up;
 856
 857         memcpy(addr->name, sunaddr, addr_len);
 858         addr->len = addr_len;
 859         addr->hash = hash ^ sk->sk_type;
 860         atomic_set(&addr->refcnt, 1);
 861
 862         if (sun_path[0]) {
 863                 umode_t mode;
 864                 err = 0;
 865                 /*
 866                  * Get the parent directory, calculate the hash for last
 867                  * component.
 868                  */
 869                 dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
 870                 err = PTR_ERR(dentry);
 871                 if (IS_ERR(dentry))
 872                         goto out_mknod_parent;
 873
 874                 /*
 875                  * All right, let's create it.
 876                  */
 877                 mode = S_IFSOCK |
 878                        (SOCK_INODE(sock)->i_mode & ~current_umask());
 879                 err = mnt_want_write(path.mnt);
 880                 if (err)
 881                         goto out_mknod_dput;
 882                 err = security_path_mknod(&path, dentry, mode, 0);
 883                 if (err)
 884                         goto out_mknod_drop_write;
 885                 err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
                     /*
                      * Reached after vfs_mknod() and also when the security
                      * hook refused; err tells the two cases apart.
                      */
 886 out_mknod_drop_write:
 887                 mnt_drop_write(path.mnt);
 888                 if (err)
 889                         goto out_mknod_dput;
                     /*
                      * Release the parent directory (its i_mutex and dentry
                      * reference); path now refers to the new socket node.
                      */
 890                 mutex_unlock(&path.dentry->d_inode->i_mutex);
 891                 dput(path.dentry);
 892                 path.dentry = dentry;
 893
                     /*
                      * The table slot for a filesystem name is chosen from
                      * the inode number below, not from addr->hash.
                      */
 894                 addr->hash = UNIX_HASH_SIZE;
 895         }
 896
 897         spin_lock(&unix_table_lock);
 898
 899         if (!sun_path[0]) {
 900                 err = -EADDRINUSE;
 901                 if (__unix_find_socket_byname(net, sunaddr, addr_len,
 902                                               sk->sk_type, hash)) {
 903                         unix_release_addr(addr);
 904                         goto out_unlock;
 905                 }
 906
 907                 list = &unix_socket_table[addr->hash];
 908         } else {
 909                 list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
 910                 u->path = path;
 911         }
 912
             /* Move the socket from its current slot to the one for the name. */
 913         err = 0;
 914         __unix_remove_socket(sk);
 915         u->addr = addr;
 916         __unix_insert_socket(list, sk);
 917
 918 out_unlock:
 919         spin_unlock(&unix_table_lock);
 920 out_up:
 921         mutex_unlock(&u->readlock);
 922 out:
 923         return err;
 924
             /* Error unwinding for the filesystem branch; ends at out_up. */
 925 out_mknod_dput:
 926         dput(dentry);
 927         mutex_unlock(&path.dentry->d_inode->i_mutex);
 928         path_put(&path);
 929 out_mknod_parent:
 930         if (err == -EEXIST)
 931                 err = -EADDRINUSE;
 932         unix_release_addr(addr);
 933         goto out_up;
 934 }
 935
 /*
  * Take the state lock of sk1 and, if it is a different non-NULL socket,
  * of sk2 as well.  When both are taken, the socket at the lower address
  * is locked first and the other one with unix_state_lock_nested(), so the
  * acquisition order does not depend on the order of the arguments.
  */
 936 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
 937 {
 938         if (unlikely(sk1 == sk2) || !sk2) {
 939                 unix_state_lock(sk1);
 940                 return;
 941         }
 942         if (sk1 < sk2) {
 943                 unix_state_lock(sk1);
 944                 unix_state_lock_nested(sk2);
 945         } else {
 946                 unix_state_lock(sk2);
 947                 unix_state_lock_nested(sk1);
 948         }
 949 }
 950
 /*
  * Counterpart of unix_state_double_lock(): release the state lock of sk1
  * and, if sk2 is a different non-NULL socket, of sk2 too.
  */
 951 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
 952 {
 953         if (unlikely(sk1 == sk2) || !sk2) {
 954                 unix_state_unlock(sk1);
 955                 return;
 956         }
 957         unix_state_unlock(sk1);
 958         unix_state_unlock(sk2);
 959 }
 960
 /*
  * unix_dgram_connect - set or clear the peer of a socket.
  *
  * With an AF_UNSPEC address the peer is cleared.  Otherwise the address
  * is validated with unix_mkname(), the socket is auto-bound first if it
  * has SOCK_PASSCRED set and no address yet, and the target is looked up
  * with unix_find_other().  The lookup is repeated if the socket found
  * turns out to be SOCK_DEAD.
  *
  * If a peer was already set it is replaced; unix_dgram_disconnected() is
  * called when the new peer differs from the old one, and the reference
  * on the old peer is dropped.
  *
  * Returns 0 on success or a negative errno: the unix_mkname(),
  * unix_autobind() or unix_find_other() error, -EPERM when
  * unix_may_send() refuses, or the security hook's error.
  */
 961 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 962                               int alen, int flags)
 963 {
 964         struct sock *sk = sock->sk;
 965         struct net *net = sock_net(sk);
 966         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
 967         struct sock *other;
 968         unsigned int hash;
 969         int err;
 970
 971         if (addr->sa_family != AF_UNSPEC) {
 972                 err = unix_mkname(sunaddr, alen, &hash);
 973                 if (err < 0)
 974                         goto out;
 975                 alen = err;
 976
 977                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
 978                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
 979                         goto out;
 980
 981 restart:
 982                 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
 983                 if (!other)
 984                         goto out;
 985
 986                 unix_state_double_lock(sk, other);
 987
 988                 /* Apparently VFS overslept socket death. Retry. */
 989                 if (sock_flag(other, SOCK_DEAD)) {
 990                         unix_state_double_unlock(sk, other);
 991                         sock_put(other);
 992                         goto restart;
 993                 }
 994
 995                 err = -EPERM;
 996                 if (!unix_may_send(sk, other))
 997                         goto out_unlock;
 998
 999                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1000                 if (err)
1001                         goto out_unlock;
1002
1003         } else {
1004                 /*
1005                  *      1003.1g breaking connected state with AF_UNSPEC
1006                  */
1007                 other = NULL;
                     /* With other == NULL this locks sk only. */
1008                 unix_state_double_lock(sk, other);
1009         }
1010
1011         /*
1012          * If it was connected, reconnect.
1013          */
1014         if (unix_peer(sk)) {
1015                 struct sock *old_peer = unix_peer(sk);
1016                 unix_peer(sk) = other;
1017                 unix_state_double_unlock(sk, other);
1018
1019                 if (other != old_peer)
1020                         unix_dgram_disconnected(sk, old_peer);
1021                 sock_put(old_peer);
1022         } else {
1023                 unix_peer(sk) = other;
1024                 unix_state_double_unlock(sk, other);
1025         }
1026         return 0;
1027
1028 out_unlock:
1029         unix_state_double_unlock(sk, other);
1030         sock_put(other);
1031 out:
1032         return err;
1033 }
1034
/*
 * unix_wait_for_peer - sleep until there may be room in other's queue.
 *
 * The function releases other's state lock without taking it, so the
 * caller must hold that lock on entry; it is not held on return.
 *
 * The task is put on other's peer_wait queue (exclusive, interruptible)
 * and sleeps for at most @timeo only if, at the time of the check, other
 * is not SOCK_DEAD, has not shut down receiving and its receive queue is
 * full.
 *
 * Returns the value of schedule_timeout() if the task slept, otherwise
 * @timeo unchanged.
 */
1035 static long unix_wait_for_peer(struct sock *other, long timeo)
1036 {
1037         struct unix_sock *u = unix_sk(other);
1038         int sched;
1039         DEFINE_WAIT(wait);
1040
1041         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1042
             /* Decide whether to sleep while the state lock is still held. */
1043         sched = !sock_flag(other, SOCK_DEAD) &&
1044                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1045                 unix_recvq_full(other);
1046
1047         unix_state_unlock(other);
1048
1049         if (sched)
1050                 timeo = schedule_timeout(timeo);
1051
1052         finish_wait(&u->peer_wait, &wait);
1053         return timeo;
1054 }
1055
/*
 * unix_stream_connect - connect a socket to a listening socket.
 *
 * Outline:
 *  - validate the address (unix_mkname()) and auto-bind the caller if it
 *    has SOCK_PASSCRED set and no address;
 *  - before taking any lock, allocate the socket that will become the
 *    accepted end (newsk) and a one-byte skb charged to it;
 *  - find the listener and lock its state; retry if it is SOCK_DEAD;
 *  - if the listener's receive queue is full, either fail with -EAGAIN
 *    (zero timeout) or wait with unix_wait_for_peer() and start over;
 *  - lock the caller's own state, re-checking that it did not change;
 *  - link sk and newsk as peers and queue the skb on the listener's
 *    receive queue (unix_accept() later takes the socket from skb->sk).
 *
 * Returns 0 on success or a negative errno: the unix_mkname(),
 * unix_autobind() or unix_find_other() error, -ENOMEM, -ECONNREFUSED
 * (target not listening or receive shut down), -EAGAIN, the
 * sock_intr_errno() value when a signal is pending, -EISCONN (caller
 * already established), -EINVAL (caller in any other state than closed)
 * or the security hook's error.
 */
1056 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1057                                int addr_len, int flags)
1058 {
1059         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1060         struct sock *sk = sock->sk;
1061         struct net *net = sock_net(sk);
1062         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1063         struct sock *newsk = NULL;
1064         struct sock *other = NULL;
1065         struct sk_buff *skb = NULL;
1066         unsigned int hash;
1067         int st;
1068         int err;
1069         long timeo;
1070
1071         err = unix_mkname(sunaddr, addr_len, &hash);
1072         if (err < 0)
1073                 goto out;
1074         addr_len = err;
1075
1076         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1077             (err = unix_autobind(sock)) != 0)
1078                 goto out;
1079
1080         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1081
1082         /* First of all allocate resources.
1083            If we will make it after state is locked,
1084            we will have to recheck all again in any case.
1085          */
1086
1087         err = -ENOMEM;
1088
1089         /* create new sock for complete connection */
1090         newsk = unix_create1(sock_net(sk), NULL);
1091         if (newsk == NULL)
1092                 goto out;
1093
1094         /* Allocate skb for sending to listening sock */
1095         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1096         if (skb == NULL)
1097                 goto out;
1098
1099 restart:
1100         /*  Find listening sock. */
1101         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1102         if (!other)
1103                 goto out;
1104
1105         /* Latch state of peer */
1106         unix_state_lock(other);
1107
1108         /* Apparently VFS overslept socket death. Retry. */
1109         if (sock_flag(other, SOCK_DEAD)) {
1110                 unix_state_unlock(other);
1111                 sock_put(other);
1112                 goto restart;
1113         }
1114
1115         err = -ECONNREFUSED;
1116         if (other->sk_state != TCP_LISTEN)
1117                 goto out_unlock;
1118         if (other->sk_shutdown & RCV_SHUTDOWN)
1119                 goto out_unlock;
1120
1121         if (unix_recvq_full(other)) {
1122                 err = -EAGAIN;
1123                 if (!timeo)
1124                         goto out_unlock;
1125
                     /* Releases other's state lock; see unix_wait_for_peer(). */
1126                 timeo = unix_wait_for_peer(other, timeo);
1127
1128                 err = sock_intr_errno(timeo);
1129                 if (signal_pending(current))
1130                         goto out;
1131                 sock_put(other);
1132                 goto restart;
1133         }
1134
1135         /* Latch our state.
1136
1137            It is tricky place. We need to grab our state lock and cannot
1138            drop lock on peer. It is dangerous because deadlock is
1139            possible. Connect to self case and simultaneous
1140            attempt to connect are eliminated by checking socket
1141            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1142            check this before attempt to grab lock.
1143
1144            Well, and we have to recheck the state after socket locked.
1145          */
1146         st = sk->sk_state;
1147
1148         switch (st) {
1149         case TCP_CLOSE:
1150                 /* This is ok... continue with connect */
1151                 break;
1152         case TCP_ESTABLISHED:
1153                 /* Socket is already connected */
1154                 err = -EISCONN;
1155                 goto out_unlock;
1156         default:
1157                 err = -EINVAL;
1158                 goto out_unlock;
1159         }
1160
1161         unix_state_lock_nested(sk);
1162
             /* State changed between the unlocked read and the lock: redo. */
1163         if (sk->sk_state != st) {
1164                 unix_state_unlock(sk);
1165                 unix_state_unlock(other);
1166                 sock_put(other);
1167                 goto restart;
1168         }
1169
1170         err = security_unix_stream_connect(sk, other, newsk);
1171         if (err) {
1172                 unix_state_unlock(sk);
1173                 goto out_unlock;
1174         }
1175
1176         /* The way is open! Fastly set all the necessary fields... */
1177
             /* newsk's peer pointer holds a reference on sk. */
1178         sock_hold(sk);
1179         unix_peer(newsk)        = sk;
1180         newsk->sk_state         = TCP_ESTABLISHED;
1181         newsk->sk_type          = sk->sk_type;
1182         init_peercred(newsk);
1183         newu = unix_sk(newsk);
1184         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1185         otheru = unix_sk(other);
1186
1187         /* copy address information from listening to new sock*/
1188         if (otheru->addr) {
1189                 atomic_inc(&otheru->addr->refcnt);
1190                 newu->addr = otheru->addr;
1191         }
1192         if (otheru->path.dentry) {
1193                 path_get(&otheru->path);
1194                 newu->path = otheru->path;
1195         }
1196
1197         /* Set credentials */
1198         copy_peercred(sk, other);
1199
1200         sock->state     = SS_CONNECTED;
1201         sk->sk_state    = TCP_ESTABLISHED;
             /* sk's peer pointer holds a reference on newsk. */
1202         sock_hold(newsk);
1203
1204         smp_mb__after_atomic_inc();     /* sock_hold() does an atomic_inc() */
1205         unix_peer(sk)   = newsk;
1206
1207         unix_state_unlock(sk);
1208
1209         /* take ten and send info to listening sock */
1210         spin_lock(&other->sk_receive_queue.lock);
1211         __skb_queue_tail(&other->sk_receive_queue, skb);
1212         spin_unlock(&other->sk_receive_queue.lock);
1213         unix_state_unlock(other);
1214         other->sk_data_ready(other, 0);
1215         sock_put(other);
1216         return 0;
1217
1218 out_unlock:
1219         if (other)
1220                 unix_state_unlock(other);
1221
             /* Common failure exit: release whatever was allocated or held. */
1222 out:
1223         kfree_skb(skb);
1224         if (newsk)
1225                 unix_release_sock(newsk, 0);
1226         if (other)
1227                 sock_put(other);
1228         return err;
1229 }
1230
/*
 * unix_socketpair - make two sockets each other's peer.
 *
 * Each socket takes a reference on the other and records it as its peer,
 * and peer credentials are initialised on both.  For every type except
 * SOCK_DGRAM both ends are additionally marked TCP_ESTABLISHED /
 * SS_CONNECTED.  Always returns 0.
 */
1231 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1232 {
             /* Note: skb here is the second struct sock, not an sk_buff. */
1233         struct sock *ska = socka->sk, *skb = sockb->sk;
1234
1235         /* Join our sockets back to back */
1236         sock_hold(ska);
1237         sock_hold(skb);
1238         unix_peer(ska) = skb;
1239         unix_peer(skb) = ska;
1240         init_peercred(ska);
1241         init_peercred(skb);
1242
1243         if (ska->sk_type != SOCK_DGRAM) {
1244                 ska->sk_state = TCP_ESTABLISHED;
1245                 skb->sk_state = TCP_ESTABLISHED;
1246                 socka->state  = SS_CONNECTED;
1247                 sockb->state  = SS_CONNECTED;
1248         }
1249         return 0;
1250 }
1251
/*
 * unix_accept - take one pending connection off a listening socket.
 *
 * Only SOCK_STREAM and SOCK_SEQPACKET sockets in TCP_LISTEN state are
 * accepted.  One skb is received from the listener's queue; the socket it
 * references (skb->sk) is the already-connected end created by the
 * connecting side.  The skb is freed, waiters on the listener's peer_wait
 * queue are woken, and the socket is grafted onto @newsock.
 *
 * Returns 0 on success, -EOPNOTSUPP for other socket types, -EINVAL when
 * the socket is not listening or no skb was returned without an error,
 * or the error reported by skb_recv_datagram().
 */
1252 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1253 {
1254         struct sock *sk = sock->sk;
1255         struct sock *tsk;
1256         struct sk_buff *skb;
1257         int err;
1258
1259         err = -EOPNOTSUPP;
1260         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1261                 goto out;
1262
1263         err = -EINVAL;
1264         if (sk->sk_state != TCP_LISTEN)
1265                 goto out;
1266
1267         /* If socket state is TCP_LISTEN it cannot change (for now...),
1268          * so that no locks are necessary.
1269          */
1270
1271         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1272         if (!skb) {
1273                 /* This means receive shutdown. */
1274                 if (err == 0)
1275                         err = -EINVAL;
1276                 goto out;
1277         }
1278
1279         tsk = skb->sk;
1280         skb_free_datagram(sk, skb);
             /* A queue slot was freed: wake tasks sleeping on peer_wait. */
1281         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1282
1283         /* attach accepted sock to socket */
1284         unix_state_lock(tsk);
1285         newsock->state = SS_CONNECTED;
1286         sock_graft(tsk, newsock);
1287         unix_state_unlock(tsk);
1288         return 0;
1289
1290 out:
1291         return err;
1292 }
1293
1294
/*
 * unix_getname - report the address of a socket or of its peer.
 *
 * @peer:      non-zero to report the peer's address instead of our own.
 * @uaddr:     buffer that receives the address.
 * @uaddr_len: receives the number of bytes written to @uaddr.
 *
 * A socket without an address is reported as family AF_UNIX with an empty
 * path and a length of sizeof(short).  The address is read with the state
 * lock of the reported socket held, and a reference on that socket is
 * held for the duration.
 *
 * Returns 0, or -ENOTCONN when @peer is set and there is no peer.
 */
1295 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1296 {
1297         struct sock *sk = sock->sk;
1298         struct unix_sock *u;
1299         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1300         int err = 0;
1301
1302         if (peer) {
1303                 sk = unix_peer_get(sk);
1304
1305                 err = -ENOTCONN;
1306                 if (!sk)
1307                         goto out;
1308                 err = 0;
1309         } else {
                     /* Balance the sock_put() below, shared with the peer case. */
1310                 sock_hold(sk);
1311         }
1312
1313         u = unix_sk(sk);
1314         unix_state_lock(sk);
1315         if (!u->addr) {
1316                 sunaddr->sun_family = AF_UNIX;
1317                 sunaddr->sun_path[0] = 0;
1318                 *uaddr_len = sizeof(short);
1319         } else {
1320                 struct unix_address *addr = u->addr;
1321
1322                 *uaddr_len = addr->len;
1323                 memcpy(sunaddr, addr->name, *uaddr_len);
1324         }
1325         unix_state_unlock(sk);
1326         sock_put(sk);
1327 out:
1328         return err;
1329 }
1330
/*
 * Move the passed-file list from the skb's control block to @scm and
 * call unix_notinflight() on every file in it, last entry first.  The
 * list pointer is dereferenced unconditionally, so the caller must only
 * call this when UNIXCB(skb).fp is non-NULL.
 */
1331 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1332 {
1333         int i;
1334
1335         scm->fp = UNIXCB(skb).fp;
1336         UNIXCB(skb).fp = NULL;
1337
1338         for (i = scm->fp->count-1; i >= 0; i--)
1339                 unix_notinflight(scm->fp->fp[i]);
1340 }
1341
/*
 * skb destructor (installed by unix_scm_to_skb()): collect the pid, cred
 * and, if present, the passed files stored in the skb's control block
 * into a temporary scm_cookie, release them with scm_destroy(), then
 * call sock_wfree() on the skb.
 */
1342 static void unix_destruct_scm(struct sk_buff *skb)
1343 {
1344         struct scm_cookie scm;
1345         memset(&scm, 0, sizeof(scm));
1346         scm.pid  = UNIXCB(skb).pid;
1347         scm.cred = UNIXCB(skb).cred;
1348         if (UNIXCB(skb).fp)
1349                 unix_detach_fds(&scm, skb);
1350
1351         /* Alas, it calls VFS */
1352         /* So fscking what? fput() had been SMP-safe since the last Summer */
1353         scm_destroy(&scm);
1354         sock_wfree(skb);
1355 }
1356
/* Highest recursion_level a passed unix socket may have; see below. */
1357 #define MAX_RECURSION_LEVEL 4
1358
/*
 * unix_attach_fds - attach the files carried by @scm to @skb.
 *
 * Scans the file list for unix sockets (unix_get_socket() non-NULL),
 * counting them and tracking the largest recursion_level among them.
 * The list is then duplicated into the skb's control block and, if at
 * least one unix socket was found, unix_inflight() is called for every
 * file in the list.
 *
 * Dereferences scm->fp unconditionally; the caller in this file checks
 * it for NULL first.
 *
 * Returns the largest recursion level found (0 if there were no unix
 * sockets), -ETOOMANYREFS if it exceeds MAX_RECURSION_LEVEL, or -ENOMEM
 * if the list could not be duplicated.
 */
1359 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1360 {
1361         int i;
1362         unsigned char max_level = 0;
1363         int unix_sock_count = 0;
1364
1365         for (i = scm->fp->count - 1; i >= 0; i--) {
1366                 struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1367
1368                 if (sk) {
1369                         unix_sock_count++;
1370                         max_level = max(max_level,
1371                                         unix_sk(sk)->recursion_level);
1372                 }
1373         }
1374         if (unlikely(max_level > MAX_RECURSION_LEVEL))
1375                 return -ETOOMANYREFS;
1376
1377         /*
1378          * Need to duplicate file references for the sake of garbage
1379          * collection.  Otherwise a socket in the fps might become a
1380          * candidate for GC while the skb is not yet queued.
1381          */
1382         UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1383         if (!UNIXCB(skb).fp)
1384                 return -ENOMEM;
1385
1386         if (unix_sock_count) {
1387                 for (i = scm->fp->count - 1; i >= 0; i--)
1388                         unix_inflight(scm->fp->fp[i]);
1389         }
1390         return max_level;
1391 }
1392
/*
 * unix_scm_to_skb - copy sender information from @scm into @skb.
 *
 * Takes a reference on the pid and, if present, the cred of @scm and
 * stores them in the skb's control block.  When @send_fds is true and
 * @scm carries files, they are attached with unix_attach_fds().  The
 * skb destructor is set to unix_destruct_scm() in every case, including
 * when attaching the files failed.
 *
 * Returns 0 when no files were attached, otherwise the return value of
 * unix_attach_fds() (a recursion level >= 0 or a negative errno).
 */
1393 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1394 {
1395         int err = 0;
1396
1397         UNIXCB(skb).pid  = get_pid(scm->pid);
1398         if (scm->cred)
1399                 UNIXCB(skb).cred = get_cred(scm->cred);
1400         UNIXCB(skb).fp = NULL;
1401         if (scm->fp && send_fds)
1402                 err = unix_attach_fds(scm, skb);
1403
1404         skb->destructor = unix_destruct_scm;
1405         return err;
1406 }
1407
1408 /*
1409  * Some apps rely on write() giving SCM_CREDENTIALS
1410  * We include credentials if source or destination socket
1411  * asserted SOCK_PASSCRED.
1412  */
/*
 * Nothing is done if the skb already carries a cred.  Otherwise the
 * current task's tgid and cred are stored (with references taken) when
 * the sending socket has SOCK_PASSCRED set, when @other has no sk_socket,
 * or when @other's socket has SOCK_PASSCRED set.
 */
1413 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1414                             const struct sock *other)
1415 {
1416         if (UNIXCB(skb).cred)
1417                 return;
1418         if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1419             !other->sk_socket ||
1420             test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1421                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1422                 UNIXCB(skb).cred = get_current_cred();
1423         }
1424 }
1425
1426 /*
1427  *      Send AF_UNIX data.
1428  */
1429
/*
 * unix_dgram_sendmsg - send one message as a single skb.
 *
 * The destination is msg->msg_name when msg_namelen is non-zero,
 * otherwise the socket's current peer.  The whole message is copied into
 * one skb (with up to MAX_SKB_FRAGS pages of paged data when it is larger
 * than SKB_MAX_ALLOC) before the destination is locked, then queued on
 * the destination's receive queue.
 *
 * If the destination's receive queue is full and we are not its peer,
 * the call either fails with -EAGAIN (zero timeout) or sleeps in
 * unix_wait_for_peer() and restarts.  A destination that turns out to be
 * SOCK_DEAD is looked up again, unless it was our connected peer, in
 * which case the peer is cleared and -ECONNREFUSED is returned.
 *
 * Returns @len on success (also when the destination's socket filter
 * dropped the packet) or a negative errno, among them: the scm_send(),
 * unix_mkname(), unix_autobind(), allocation, copy or lookup error,
 * -EOPNOTSUPP (MSG_OOB), -ENOTCONN (no address and no peer), -EMSGSIZE,
 * -ECONNRESET, -EPERM, -EPIPE (receiver shut down), -EAGAIN, or the
 * sock_intr_errno() value when a signal is pending.
 */
1430 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1431                               struct msghdr *msg, size_t len)
1432 {
1433         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1434         struct sock *sk = sock->sk;
1435         struct net *net = sock_net(sk);
1436         struct unix_sock *u = unix_sk(sk);
1437         struct sockaddr_un *sunaddr = msg->msg_name;
1438         struct sock *other = NULL;
1439         int namelen = 0; /* fake GCC */
1440         int err;
1441         unsigned int hash;
1442         struct sk_buff *skb;
1443         long timeo;
1444         struct scm_cookie tmp_scm;
1445         int max_level;
1446         int data_len = 0;
1447
1448         if (NULL == siocb->scm)
1449                 siocb->scm = &tmp_scm;
1450         wait_for_unix_gc();
1451         err = scm_send(sock, msg, siocb->scm);
1452         if (err < 0)
1453                 return err;
1454
1455         err = -EOPNOTSUPP;
1456         if (msg->msg_flags&MSG_OOB)
1457                 goto out;
1458
1459         if (msg->msg_namelen) {
1460                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1461                 if (err < 0)
1462                         goto out;
1463                 namelen = err;
1464         } else {
                     /* No address given: send to the connected peer, if any. */
1465                 sunaddr = NULL;
1466                 err = -ENOTCONN;
1467                 other = unix_peer_get(sk);
1468                 if (!other)
1469                         goto out;
1470         }
1471
1472         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1473             && (err = unix_autobind(sock)) != 0)
1474                 goto out;
1475
1476         err = -EMSGSIZE;
1477         if (len > sk->sk_sndbuf - 32)
1478                 goto out;
1479
             /* Put what does not fit a linear allocation into page frags. */
1480         if (len > SKB_MAX_ALLOC)
1481                 data_len = min_t(size_t,
1482                                  len - SKB_MAX_ALLOC,
1483                                  MAX_SKB_FRAGS * PAGE_SIZE);
1484
1485         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1486                                    msg->msg_flags & MSG_DONTWAIT, &err);
1487         if (skb == NULL)
1488                 goto out;
1489
1490         err = unix_scm_to_skb(siocb->scm, skb, true);
1491         if (err < 0)
1492                 goto out_free;
             /* Queueing these files puts the receiver one level deeper. */
1493         max_level = err + 1;
1494         unix_get_secdata(siocb->scm, skb);
1495
1496         skb_put(skb, len - data_len);
1497         skb->data_len = data_len;
1498         skb->len = len;
1499         err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
1500         if (err)
1501                 goto out_free;
1502
1503         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1504
1505 restart:
1506         if (!other) {
1507                 err = -ECONNRESET;
1508                 if (sunaddr == NULL)
1509                         goto out_free;
1510
1511                 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1512                                         hash, &err);
1513                 if (other == NULL)
1514                         goto out_free;
1515         }
1516
1517         if (sk_filter(other, skb) < 0) {
1518                 /* Toss the packet but do not return any error to the sender */
1519                 err = len;
1520                 goto out_free;
1521         }
1522
1523         unix_state_lock(other);
1524         err = -EPERM;
1525         if (!unix_may_send(sk, other))
1526                 goto out_unlock;
1527
1528         if (sock_flag(other, SOCK_DEAD)) {
1529                 /*
1530                  *      Check with 1003.1g - what should
1531                  *      datagram error
1532                  */
1533                 unix_state_unlock(other);
1534                 sock_put(other);
1535
1536                 err = 0;
1537                 unix_state_lock(sk);
1538                 if (unix_peer(sk) == other) {
                             /* The dead socket was our peer: disconnect. */
1539                         unix_peer(sk) = NULL;
1540                         unix_state_unlock(sk);
1541
1542                         unix_dgram_disconnected(sk, other);
                             /* Drop the reference the peer pointer held. */
1543                         sock_put(other);
1544                         err = -ECONNREFUSED;
1545                 } else {
1546                         unix_state_unlock(sk);
1547                 }
1548
1549                 other = NULL;
1550                 if (err)
1551                         goto out_free;
1552                 goto restart;
1553         }
1554
1555         err = -EPIPE;
1556         if (other->sk_shutdown & RCV_SHUTDOWN)
1557                 goto out_unlock;
1558
1559         if (sk->sk_type != SOCK_SEQPACKET) {
1560                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1561                 if (err)
1562                         goto out_unlock;
1563         }
1564
1565         if (unix_peer(other) != sk && unix_recvq_full(other)) {
1566                 if (!timeo) {
1567                         err = -EAGAIN;
1568                         goto out_unlock;
1569                 }
1570
                     /* Releases other's state lock; see unix_wait_for_peer(). */
1571                 timeo = unix_wait_for_peer(other, timeo);
1572
1573                 err = sock_intr_errno(timeo);
1574                 if (signal_pending(current))
1575                         goto out_free;
1576
1577                 goto restart;
1578         }
1579
1580         if (sock_flag(other, SOCK_RCVTSTAMP))
1581                 __net_timestamp(skb);
1582         maybe_add_creds(skb, sock, other);
1583         skb_queue_tail(&other->sk_receive_queue, skb);
1584         if (max_level > unix_sk(other)->recursion_level)
1585                 unix_sk(other)->recursion_level = max_level;
1586         unix_state_unlock(other);
1587         other->sk_data_ready(other, len);
1588         sock_put(other);
1589         scm_destroy(siocb->scm);
1590         return len;
1591
1592 out_unlock:
1593         unix_state_unlock(other);
1594 out_free:
1595         kfree_skb(skb);
1596 out:
1597         if (other)
1598                 sock_put(other);
1599         scm_destroy(siocb->scm);
1600         return err;
1601 }
1602
1603
/*
 * unix_stream_sendmsg - send data to the connected peer.
 *
 * The data is cut into chunks of at most half the send buffer minus 64
 * bytes, and at most SKB_MAX_ALLOC; each chunk is copied into its own skb
 * and queued on the peer's receive queue.  Passed files are attached to
 * the first skb only.  A destination address is not accepted.
 *
 * Returns the number of bytes queued if that is non-zero, even when a
 * later chunk failed.  Otherwise returns a negative errno: the
 * scm_send(), allocation, unix_scm_to_skb() or copy error, -EOPNOTSUPP
 * (MSG_OOB, or an address given while not established), -EISCONN
 * (address given while established), -ENOTCONN (no peer) or -EPIPE
 * (send side shut down, peer dead or peer's receive side shut down).
 * SIGPIPE is raised on the -EPIPE path only if nothing was sent and
 * MSG_NOSIGNAL is not set.
 */
1604 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1605                                struct msghdr *msg, size_t len)
1606 {
1607         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1608         struct sock *sk = sock->sk;
1609         struct sock *other = NULL;
1610         int err, size;
1611         struct sk_buff *skb;
1612         int sent = 0;
1613         struct scm_cookie tmp_scm;
1614         bool fds_sent = false;
1615         int max_level;
1616
1617         if (NULL == siocb->scm)
1618                 siocb->scm = &tmp_scm;
1619         wait_for_unix_gc();
1620         err = scm_send(sock, msg, siocb->scm);
1621         if (err < 0)
1622                 return err;
1623
1624         err = -EOPNOTSUPP;
1625         if (msg->msg_flags&MSG_OOB)
1626                 goto out_err;
1627
1628         if (msg->msg_namelen) {
1629                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1630                 goto out_err;
1631         } else {
1632                 err = -ENOTCONN;
                     /* Note: no extra reference is taken on the peer here. */
1633                 other = unix_peer(sk);
1634                 if (!other)
1635                         goto out_err;
1636         }
1637
1638         if (sk->sk_shutdown & SEND_SHUTDOWN)
1639                 goto pipe_err;
1640
1641         while (sent < len) {
1642                 /*
1643                  *      Optimisation for the fact that under 0.01% of X
1644                  *      messages typically need breaking up.
1645                  */
1646
1647                 size = len-sent;
1648
1649                 /* Keep two messages in the pipe so it schedules better */
1650                 if (size > ((sk->sk_sndbuf >> 1) - 64))
1651                         size = (sk->sk_sndbuf >> 1) - 64;
1652
1653                 if (size > SKB_MAX_ALLOC)
1654                         size = SKB_MAX_ALLOC;
1655
1656                 /*
1657                  *      Grab a buffer
1658                  */
1659
1660                 skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT,
1661                                           &err);
1662
1663                 if (skb == NULL)
1664                         goto out_err;
1665
1666                 /*
1667                  *      If you pass two values to the sock_alloc_send_skb
1668                  *      it tries to grab the large buffer with GFP_NOFS
1669                  *      (which can fail easily), and if it fails grab the
1670                  *      fallback size buffer which is under a page and will
1671                  *      succeed. [Alan]
1672                  */
1673                 size = min_t(int, size, skb_tailroom(skb));
1674
1675
1676                 /* Only send the fds in the first buffer */
1677                 err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
1678                 if (err < 0) {
1679                         kfree_skb(skb);
1680                         goto out_err;
1681                 }
1682                 max_level = err + 1;
1683                 fds_sent = true;
1684
1685                 err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
1686                 if (err) {
1687                         kfree_skb(skb);
1688                         goto out_err;
1689                 }
1690
1691                 unix_state_lock(other);
1692
1693                 if (sock_flag(other, SOCK_DEAD) ||
1694                     (other->sk_shutdown & RCV_SHUTDOWN))
1695                         goto pipe_err_free;
1696
1697                 maybe_add_creds(skb, sock, other);
1698                 skb_queue_tail(&other->sk_receive_queue, skb);
1699                 if (max_level > unix_sk(other)->recursion_level)
1700                         unix_sk(other)->recursion_level = max_level;
1701                 unix_state_unlock(other);
1702                 other->sk_data_ready(other, size);
1703                 sent += size;
1704         }
1705
1706         scm_destroy(siocb->scm);
1707         siocb->scm = NULL;
1708
1709         return sent;
1710
1711 pipe_err_free:
1712         unix_state_unlock(other);
1713         kfree_skb(skb);
1714 pipe_err:
1715         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1716                 send_sig(SIGPIPE, current, 0);
1717         err = -EPIPE;
1718 out_err:
1719         scm_destroy(siocb->scm);
1720         siocb->scm = NULL;
             /* A partial send reports the byte count rather than the error. */
1721         return sent ? : err;
1722 }
1723
/*
 * unix_seqpacket_sendmsg - send on a SOCK_SEQPACKET socket.
 *
 * Returns a pending socket error if sock_error() reports one, and
 * -ENOTCONN unless the socket is TCP_ESTABLISHED.  Any destination
 * address supplied by the caller is ignored by zeroing msg_namelen; the
 * message is then sent through unix_dgram_sendmsg(), whose result is
 * returned.
 */
1724 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1725                                   struct msghdr *msg, size_t len)
1726 {
1727         int err;
1728         struct sock *sk = sock->sk;
1729
1730         err = sock_error(sk);
1731         if (err)
1732                 return err;
1733
1734         if (sk->sk_state != TCP_ESTABLISHED)
1735                 return -ENOTCONN;
1736
1737         if (msg->msg_namelen)
1738                 msg->msg_namelen = 0;
1739
1740         return unix_dgram_sendmsg(kiocb, sock, msg, len);
1741 }
1742
1743 static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
1744                               struct msghdr *msg, size_t size,
1745                               int flags)
1746 {
1747         struct sock *sk = sock->sk;
1748
1749         if (sk->sk_state != TCP_ESTABLISHED)
1750                 return -ENOTCONN;
1751
1752         return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
1753 }
1754
1755 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1756 {
1757         struct unix_sock *u = unix_sk(sk);
1758
1759         msg->msg_namelen = 0;
1760         if (u->addr) {
1761                 msg->msg_namelen = u->addr->len;
1762                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1763         }
1764 }
1765
1766 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1767                               struct msghdr *msg, size_t size,
1768                               int flags)
1769 {
1770         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1771         struct scm_cookie tmp_scm;
1772         struct sock *sk = sock->sk;
1773         struct unix_sock *u = unix_sk(sk);
1774         int noblock = flags & MSG_DONTWAIT;
1775         struct sk_buff *skb;
1776         int err;
1777         int peeked, skip;
1778
1779         err = -EOPNOTSUPP;
1780         if (flags&MSG_OOB)
1781                 goto out;
1782
1783         msg->msg_namelen = 0;
1784
1785         err = mutex_lock_interruptible(&u->readlock);
1786         if (err) {
1787                 err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
1788                 goto out;
1789         }
1790
1791         skip = sk_peek_offset(sk, flags);
1792
1793         skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1794         if (!skb) {
1795                 unix_state_lock(sk);
1796                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1797                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1798                     (sk->sk_shutdown & RCV_SHUTDOWN))
1799                         err = 0;
1800                 unix_state_unlock(sk);
1801                 goto out_unlock;
1802         }
1803
1804         wake_up_interruptible_sync_poll(&u->peer_wait,
1805                                         POLLOUT | POLLWRNORM | POLLWRBAND);
1806
1807         if (msg->msg_name)
1808                 unix_copy_addr(msg, skb->sk);
1809
1810         if (size > skb->len - skip)
1811                 size = skb->len - skip;
1812         else if (size < skb->len - skip)
1813                 msg->msg_flags |= MSG_TRUNC;
1814
1815         err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
1816         if (err)
1817                 goto out_free;
1818
1819         if (sock_flag(sk, SOCK_RCVTSTAMP))
1820                 __sock_recv_timestamp(msg, sk, skb);
1821
1822         if (!siocb->scm) {
1823                 siocb->scm = &tmp_scm;
1824                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1825         }
1826         scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
1827         unix_set_secdata(siocb->scm, skb);
1828
1829         if (!(flags & MSG_PEEK)) {
1830                 if (UNIXCB(skb).fp)
1831                         unix_detach_fds(siocb->scm, skb);
1832
1833                 sk_peek_offset_bwd(sk, skb->len);
1834         } else {
1835                 /* It is questionable: on PEEK we could:
1836                    - do not return fds - good, but too simple 8)
1837                    - return fds, and do not return them on read (old strategy,
1838                      apparently wrong)
1839                    - clone fds (I chose it for now, it is the most universal
1840                      solution)
1841
1842                    POSIX 1003.1g does not actually define this clearly
1843                    at all. POSIX 1003.1g doesn't define a lot of things
1844                    clearly however!
1845
1846                 */
1847
1848                 sk_peek_offset_fwd(sk, size);
1849
1850                 if (UNIXCB(skb).fp)
1851                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1852         }
1853         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1854
1855         scm_recv(sock, msg, siocb->scm, flags);
1856
1857 out_free:
1858         skb_free_datagram(sk, skb);
1859 out_unlock:
1860         mutex_unlock(&u->readlock);
1861 out:
1862         return err;
1863 }
1864
1865 /*
1866  *      Sleep until data has arrive. But check for races..
1867  */
1868
1869 static long unix_stream_data_wait(struct sock *sk, long timeo)
1870 {
1871         DEFINE_WAIT(wait);
1872
1873         unix_state_lock(sk);
1874
1875         for (;;) {
1876                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1877
1878                 if (!skb_queue_empty(&sk->sk_receive_queue) ||
1879                     sk->sk_err ||
1880                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1881                     signal_pending(current) ||
1882                     !timeo)
1883                         break;
1884
1885                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1886                 unix_state_unlock(sk);
1887                 timeo = schedule_timeout(timeo);
1888                 unix_state_lock(sk);
1889                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1890         }
1891
1892         finish_wait(sk_sleep(sk), &wait);
1893         unix_state_unlock(sk);
1894         return timeo;
1895 }
1896
1897
1898
1899 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1900                                struct msghdr *msg, size_t size,
1901                                int flags)
1902 {
1903         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1904         struct scm_cookie tmp_scm;
1905         struct sock *sk = sock->sk;
1906         struct unix_sock *u = unix_sk(sk);
1907         struct sockaddr_un *sunaddr = msg->msg_name;
1908         int copied = 0;
1909         int check_creds = 0;
1910         int target;
1911         int err = 0;
1912         long timeo;
1913         int skip;
1914
1915         err = -EINVAL;
1916         if (sk->sk_state != TCP_ESTABLISHED)
1917                 goto out;
1918
1919         err = -EOPNOTSUPP;
1920         if (flags&MSG_OOB)
1921                 goto out;
1922
1923         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1924         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1925
1926         msg->msg_namelen = 0;
1927
1928         /* Lock the socket to prevent queue disordering
1929          * while sleeps in memcpy_tomsg
1930          */
1931
1932         if (!siocb->scm) {
1933                 siocb->scm = &tmp_scm;
1934                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1935         }
1936
1937         err = mutex_lock_interruptible(&u->readlock);
1938         if (err) {
1939                 err = sock_intr_errno(timeo);
1940                 goto out;
1941         }
1942
1943         skip = sk_peek_offset(sk, flags);
1944
1945         do {
1946                 int chunk;
1947                 struct sk_buff *skb;
1948
1949                 unix_state_lock(sk);
1950                 skb = skb_peek(&sk->sk_receive_queue);
1951 again:
1952                 if (skb == NULL) {
1953                         unix_sk(sk)->recursion_level = 0;
1954                         if (copied >= target)
1955                                 goto unlock;
1956
1957                         /*
1958                          *      POSIX 1003.1g mandates this order.
1959                          */
1960
1961                         err = sock_error(sk);
1962                         if (err)
1963                                 goto unlock;
1964                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1965                                 goto unlock;
1966
1967                         unix_state_unlock(sk);
1968                         err = -EAGAIN;
1969                         if (!timeo)
1970                                 break;
1971                         mutex_unlock(&u->readlock);
1972
1973                         timeo = unix_stream_data_wait(sk, timeo);
1974
1975                         if (signal_pending(current)
1976                             ||  mutex_lock_interruptible(&u->readlock)) {
1977                                 err = sock_intr_errno(timeo);
1978                                 goto out;
1979                         }
1980
1981                         continue;
1982  unlock:
1983                         unix_state_unlock(sk);
1984                         break;
1985                 }
1986
1987                 if (skip >= skb->len) {
1988                         skip -= skb->len;
1989                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
1990                         goto again;
1991                 }
1992
1993                 unix_state_unlock(sk);
1994
1995                 if (check_creds) {
1996                         /* Never glue messages from different writers */
1997                         if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
1998                             (UNIXCB(skb).cred != siocb->scm->cred))
1999                                 break;
2000                 } else {
2001                         /* Copy credentials */
2002                         scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
2003                         check_creds = 1;
2004                 }
2005
2006                 /* Copy address just once */
2007                 if (sunaddr) {
2008                         unix_copy_addr(msg, skb->sk);
2009                         sunaddr = NULL;
2010                 }
2011
2012                 chunk = min_t(unsigned int, skb->len - skip, size);
2013                 if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) {
2014                         if (copied == 0)
2015                                 copied = -EFAULT;
2016                         break;
2017                 }
2018                 copied += chunk;
2019                 size -= chunk;
2020
2021                 /* Mark read part of skb as used */
2022                 if (!(flags & MSG_PEEK)) {
2023                         skb_pull(skb, chunk);
2024
2025                         sk_peek_offset_bwd(sk, chunk);
2026
2027                         if (UNIXCB(skb).fp)
2028                                 unix_detach_fds(siocb->scm, skb);
2029
2030                         if (skb->len)
2031                                 break;
2032
2033                         skb_unlink(skb, &sk->sk_receive_queue);
2034                         consume_skb(skb);
2035
2036                         if (siocb->scm->fp)
2037                                 break;
2038                 } else {
2039                         /* It is questionable, see note in unix_dgram_recvmsg.
2040                          */
2041                         if (UNIXCB(skb).fp)
2042                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2043
2044                         sk_peek_offset_fwd(sk, chunk);
2045
2046                         break;
2047                 }
2048         } while (size);
2049
2050         mutex_unlock(&u->readlock);
2051         scm_recv(sock, msg, siocb->scm, flags);
2052 out:
2053         return copied ? : err;
2054 }
2055
2056 static int unix_shutdown(struct socket *sock, int mode)
2057 {
2058         struct sock *sk = sock->sk;
2059         struct sock *other;
2060
2061         mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
2062
2063         if (!mode)
2064                 return 0;
2065
2066         unix_state_lock(sk);
2067         sk->sk_shutdown |= mode;
2068         other = unix_peer(sk);
2069         if (other)
2070                 sock_hold(other);
2071         unix_state_unlock(sk);
2072         sk->sk_state_change(sk);
2073
2074         if (other &&
2075                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2076
2077                 int peer_mode = 0;
2078
2079                 if (mode&RCV_SHUTDOWN)
2080                         peer_mode |= SEND_SHUTDOWN;
2081                 if (mode&SEND_SHUTDOWN)
2082                         peer_mode |= RCV_SHUTDOWN;
2083                 unix_state_lock(other);
2084                 other->sk_shutdown |= peer_mode;
2085                 unix_state_unlock(other);
2086                 other->sk_state_change(other);
2087                 if (peer_mode == SHUTDOWN_MASK)
2088                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2089                 else if (peer_mode & RCV_SHUTDOWN)
2090                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2091         }
2092         if (other)
2093                 sock_put(other);
2094
2095         return 0;
2096 }
2097
2098 long unix_inq_len(struct sock *sk)
2099 {
2100         struct sk_buff *skb;
2101         long amount = 0;
2102
2103         if (sk->sk_state == TCP_LISTEN)
2104                 return -EINVAL;
2105
2106         spin_lock(&sk->sk_receive_queue.lock);
2107         if (sk->sk_type == SOCK_STREAM ||
2108             sk->sk_type == SOCK_SEQPACKET) {
2109                 skb_queue_walk(&sk->sk_receive_queue, skb)
2110                         amount += skb->len;
2111         } else {
2112                 skb = skb_peek(&sk->sk_receive_queue);
2113                 if (skb)
2114                         amount = skb->len;
2115         }
2116         spin_unlock(&sk->sk_receive_queue.lock);
2117
2118         return amount;
2119 }
2120 EXPORT_SYMBOL_GPL(unix_inq_len);
2121
/*
 *	Amount of outgoing data for @sk, as reported by sk_wmem_alloc_get().
 */
long unix_outq_len(struct sock *sk)
{
	long amount = sk_wmem_alloc_get(sk);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2127
2128 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2129 {
2130         struct sock *sk = sock->sk;
2131         long amount = 0;
2132         int err;
2133
2134         switch (cmd) {
2135         case SIOCOUTQ:
2136                 amount = unix_outq_len(sk);
2137                 err = put_user(amount, (int __user *)arg);
2138                 break;
2139         case SIOCINQ:
2140                 amount = unix_inq_len(sk);
2141                 if (amount < 0)
2142                         err = amount;
2143                 else
2144                         err = put_user(amount, (int __user *)arg);
2145                 break;
2146         default:
2147                 err = -ENOIOCTLCMD;
2148                 break;
2149         }
2150         return err;
2151 }
2152
2153 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2154 {
2155         struct sock *sk = sock->sk;
2156         unsigned int mask;
2157
2158         sock_poll_wait(file, sk_sleep(sk), wait);
2159         mask = 0;
2160
2161         /* exceptional events? */
2162         if (sk->sk_err)
2163                 mask |= POLLERR;
2164         if (sk->sk_shutdown == SHUTDOWN_MASK)
2165                 mask |= POLLHUP;
2166         if (sk->sk_shutdown & RCV_SHUTDOWN)
2167                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2168
2169         /* readable? */
2170         if (!skb_queue_empty(&sk->sk_receive_queue))
2171                 mask |= POLLIN | POLLRDNORM;
2172
2173         /* Connection-based need to check for termination and startup */
2174         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2175             sk->sk_state == TCP_CLOSE)
2176                 mask |= POLLHUP;
2177
2178         /*
2179          * we set writable also when the other side has shut down the
2180          * connection. This prevents stuck sockets.
2181          */
2182         if (unix_writable(sk))
2183                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2184
2185         return mask;
2186 }
2187
2188 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2189                                     poll_table *wait)
2190 {
2191         struct sock *sk = sock->sk, *other;
2192         unsigned int mask, writable;
2193
2194         sock_poll_wait(file, sk_sleep(sk), wait);
2195         mask = 0;
2196
2197         /* exceptional events? */
2198         if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2199                 mask |= POLLERR;
2200         if (sk->sk_shutdown & RCV_SHUTDOWN)
2201                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2202         if (sk->sk_shutdown == SHUTDOWN_MASK)
2203                 mask |= POLLHUP;
2204
2205         /* readable? */
2206         if (!skb_queue_empty(&sk->sk_receive_queue))
2207                 mask |= POLLIN | POLLRDNORM;
2208
2209         /* Connection-based need to check for termination and startup */
2210         if (sk->sk_type == SOCK_SEQPACKET) {
2211                 if (sk->sk_state == TCP_CLOSE)
2212                         mask |= POLLHUP;
2213                 /* connection hasn't started yet? */
2214                 if (sk->sk_state == TCP_SYN_SENT)
2215                         return mask;
2216         }
2217
2218         /* No write status requested, avoid expensive OUT tests. */
2219         if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2220                 return mask;
2221
2222         writable = unix_writable(sk);
2223         other = unix_peer_get(sk);
2224         if (other) {
2225                 if (unix_peer(other) != sk) {
2226                         sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2227                         if (unix_recvq_full(other))
2228                                 writable = 0;
2229                 }
2230                 sock_put(other);
2231         }
2232
2233         if (writable)
2234                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2235         else
2236                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2237
2238         return mask;
2239 }
2240
2241 #ifdef CONFIG_PROC_FS
2242 static struct sock *first_unix_socket(int *i)
2243 {
2244         for (*i = 0; *i <= UNIX_HASH_SIZE; (*i)++) {
2245                 if (!hlist_empty(&unix_socket_table[*i]))
2246                         return __sk_head(&unix_socket_table[*i]);
2247         }
2248         return NULL;
2249 }
2250
2251 static struct sock *next_unix_socket(int *i, struct sock *s)
2252 {
2253         struct sock *next = sk_next(s);
2254         /* More in this chain? */
2255         if (next)
2256                 return next;
2257         /* Look for next non-empty chain. */
2258         for ((*i)++; *i <= UNIX_HASH_SIZE; (*i)++) {
2259                 if (!hlist_empty(&unix_socket_table[*i]))
2260                         return __sk_head(&unix_socket_table[*i]);
2261         }
2262         return NULL;
2263 }
2264
2265 struct unix_iter_state {
2266         struct seq_net_private p;
2267         int i;
2268 };
2269
2270 static struct sock *unix_seq_idx(struct seq_file *seq, loff_t pos)
2271 {
2272         struct unix_iter_state *iter = seq->private;
2273         loff_t off = 0;
2274         struct sock *s;
2275
2276         for (s = first_unix_socket(&iter->i); s; s = next_unix_socket(&iter->i, s)) {
2277                 if (sock_net(s) != seq_file_net(seq))
2278                         continue;
2279                 if (off == pos)
2280                         return s;
2281                 ++off;
2282         }
2283         return NULL;
2284 }
2285
2286 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2287         __acquires(unix_table_lock)
2288 {
2289         spin_lock(&unix_table_lock);
2290         return *pos ? unix_seq_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2291 }
2292
2293 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2294 {
2295         struct unix_iter_state *iter = seq->private;
2296         struct sock *sk = v;
2297         ++*pos;
2298
2299         if (v == SEQ_START_TOKEN)
2300                 sk = first_unix_socket(&iter->i);
2301         else
2302                 sk = next_unix_socket(&iter->i, sk);
2303         while (sk && (sock_net(sk) != seq_file_net(seq)))
2304                 sk = next_unix_socket(&iter->i, sk);
2305         return sk;
2306 }
2307
2308 static void unix_seq_stop(struct seq_file *seq, void *v)
2309         __releases(unix_table_lock)
2310 {
2311         spin_unlock(&unix_table_lock);
2312 }
2313
2314 static int unix_seq_show(struct seq_file *seq, void *v)
2315 {
2316
2317         if (v == SEQ_START_TOKEN)
2318                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2319                          "Inode Path\n");
2320         else {
2321                 struct sock *s = v;
2322                 struct unix_sock *u = unix_sk(s);
2323                 unix_state_lock(s);
2324
2325                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2326                         s,
2327                         atomic_read(&s->sk_refcnt),
2328                         0,
2329                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2330                         s->sk_type,
2331                         s->sk_socket ?
2332                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2333                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2334                         sock_i_ino(s));
2335
2336                 if (u->addr) {
2337                         int i, len;
2338                         seq_putc(seq, ' ');
2339
2340                         i = 0;
2341                         len = u->addr->len - sizeof(short);
2342                         if (!UNIX_ABSTRACT(s))
2343                                 len--;
2344                         else {
2345                                 seq_putc(seq, '@');
2346                                 i++;
2347                         }
2348                         for ( ; i < len; i++)
2349                                 seq_putc(seq, u->addr->name->sun_path[i]);
2350                 }
2351                 unix_state_unlock(s);
2352                 seq_putc(seq, '\n');
2353         }
2354
2355         return 0;
2356 }
2357
2358 static const struct seq_operations unix_seq_ops = {
2359         .start  = unix_seq_start,
2360         .next   = unix_seq_next,
2361         .stop   = unix_seq_stop,
2362         .show   = unix_seq_show,
2363 };
2364
2365 static int unix_seq_open(struct inode *inode, struct file *file)
2366 {
2367         return seq_open_net(inode, file, &unix_seq_ops,
2368                             sizeof(struct unix_iter_state));
2369 }
2370
2371 static const struct file_operations unix_seq_fops = {
2372         .owner          = THIS_MODULE,
2373         .open           = unix_seq_open,
2374         .read           = seq_read,
2375         .llseek         = seq_lseek,
2376         .release        = seq_release_net,
2377 };
2378
2379 #endif
2380
2381 static const struct net_proto_family unix_family_ops = {
2382         .family = PF_UNIX,
2383         .create = unix_create,
2384         .owner  = THIS_MODULE,
2385 };
2386
2387
2388 static int __net_init unix_net_init(struct net *net)
2389 {
2390         int error = -ENOMEM;
2391
2392         net->unx.sysctl_max_dgram_qlen = 10;
2393         if (unix_sysctl_register(net))
2394                 goto out;
2395
2396 #ifdef CONFIG_PROC_FS
2397         if (!proc_net_fops_create(net, "unix", 0, &unix_seq_fops)) {
2398                 unix_sysctl_unregister(net);
2399                 goto out;
2400         }
2401 #endif
2402         error = 0;
2403 out:
2404         return error;
2405 }
2406
2407 static void __net_exit unix_net_exit(struct net *net)
2408 {
2409         unix_sysctl_unregister(net);
2410         proc_net_remove(net, "unix");
2411 }
2412
2413 static struct pernet_operations unix_net_ops = {
2414         .init = unix_net_init,
2415         .exit = unix_net_exit,
2416 };
2417
2418 static int __init af_unix_init(void)
2419 {
2420         int rc = -1;
2421         struct sk_buff *dummy_skb;
2422
2423         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb));
2424
2425         rc = proto_register(&unix_proto, 1);
2426         if (rc != 0) {
2427                 printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2428                        __func__);
2429                 goto out;
2430         }
2431
2432         sock_register(&unix_family_ops);
2433         register_pernet_subsys(&unix_net_ops);
2434 out:
2435         return rc;
2436 }
2437
2438 static void __exit af_unix_exit(void)
2439 {
2440         sock_unregister(PF_UNIX);
2441         proto_unregister(&unix_proto);
2442         unregister_pernet_subsys(&unix_net_ops);
2443 }
2444
2445 /* Earlier than device_initcall() so that other drivers invoking
2446    request_module() don't end up in a loop when modprobe tries
2447    to use a UNIX socket. But later than subsys_initcall() because
2448    we depend on stuff initialised there */
2449 fs_initcall(af_unix_init);
2450 module_exit(af_unix_exit);
2451
2452 MODULE_LICENSE("GPL");
2453 MODULE_ALIAS_NETPROTO(PF_UNIX);