Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
[firefly-linux-kernel-4.4.55.git] / net / netlink / af_netlink.c
index 6560635fd25cd4bf25cf7bdbcce50416216c08dd..2a3e9ba814c4125d877554b80bc9e7353e7a21e9 100644 (file)
@@ -3,6 +3,7 @@
  *
  *             Authors:        Alan Cox <alan@lxorguk.ukuu.org.uk>
  *                             Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *                             Patrick McHardy <kaber@trash.net>
  *
  *             This program is free software; you can redistribute it and/or
  *             modify it under the terms of the GNU General Public License
@@ -110,12 +111,45 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u
        return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask];
 }
 
+/*
+ * Account for a message that could not be delivered to @sk.
+ *
+ * Unless NETLINK_RECV_NO_ENOBUFS is set in the socket flags, the call that
+ * flips NETLINK_CONGESTED from clear to set also stores ENOBUFS in sk_err
+ * and invokes the socket's error-report callback. sk_drops is incremented
+ * on every call, whether or not an error was raised.
+ */
+static void netlink_overrun(struct sock *sk)
+{
+       struct netlink_sock *nlk = nlk_sk(sk);
+       bool report_enobufs = !(nlk->flags & NETLINK_RECV_NO_ENOBUFS);
+
+       /* Only the caller that wins the bit transition raises the error. */
+       if (report_enobufs &&
+           !test_and_set_bit(NETLINK_CONGESTED, &nlk->state)) {
+               sk->sk_err = ENOBUFS;
+               sk->sk_error_report(sk);
+       }
+
+       atomic_inc(&sk->sk_drops);
+}
+
+/*
+ * Re-evaluate congestion on @sk and wake senders sleeping on nlk->wait.
+ *
+ * NETLINK_CONGESTED is cleared when the receive queue is empty; the waiters
+ * are woken only if the bit is clear afterwards.
+ */
+static void netlink_rcv_wake(struct sock *sk)
+{
+       struct netlink_sock *nlk = nlk_sk(sk);
+
+       if (skb_queue_empty(&sk->sk_receive_queue))
+               clear_bit(NETLINK_CONGESTED, &nlk->state);
+
+       /* Still congested: leave the waiters asleep. */
+       if (test_bit(NETLINK_CONGESTED, &nlk->state))
+               return;
+
+       wake_up_interruptible(&nlk->wait);
+}
+
 #ifdef CONFIG_NETLINK_MMAP
 static bool netlink_skb_is_mmaped(const struct sk_buff *skb)
 {
        return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
 }
 
+/* True when @sk's rx_ring.pg_vec is non-NULL, i.e. an RX ring is set up. */
+static bool netlink_rx_is_mmaped(struct sock *sk)
+{
+       const struct netlink_sock *nlk = nlk_sk(sk);
+
+       return nlk->rx_ring.pg_vec != NULL;
+}
+
+/* True when @sk's tx_ring.pg_vec is non-NULL, i.e. a TX ring is set up. */
+static bool netlink_tx_is_mmaped(struct sock *sk)
+{
+       const struct netlink_sock *nlk = nlk_sk(sk);
+
+       return nlk->tx_ring.pg_vec != NULL;
+}
+
 static __pure struct page *pgvec_to_page(const void *addr)
 {
        if (is_vmalloc_addr(addr))
@@ -431,12 +465,48 @@ static void netlink_forward_ring(struct netlink_ring *ring)
        } while (ring->head != head);
 }
 
+/*
+ * Decide whether the RX ring of @nlk has room for a dump to continue.
+ *
+ * Two frames must both be NL_MMAP_STATUS_UNUSED: the current frame, and the
+ * frame frame_max / 2 positions past ring->head (wrapped back by frame_max
+ * when the position exceeds it).
+ */
+static bool netlink_dump_space(struct netlink_sock *nlk)
+{
+       struct netlink_ring *ring = &nlk->rx_ring;
+       const struct nl_mmap_hdr *probe;
+       unsigned int pos;
+
+       /* No unused frame at the current position: no room at all. */
+       if (netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED) == NULL)
+               return false;
+
+       pos = ring->head + ring->frame_max / 2;
+       if (pos > ring->frame_max)
+               pos -= ring->frame_max;
+
+       probe = __netlink_lookup_frame(ring, pos);
+       return probe->nm_status == NL_MMAP_STATUS_UNUSED;
+}
+
 static unsigned int netlink_poll(struct file *file, struct socket *sock,
                                 poll_table *wait)
 {
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        unsigned int mask;
+       int err;
+
+       if (nlk->rx_ring.pg_vec != NULL) {
+               /* Memory mapped sockets don't call recvmsg(), so flow control
+                * for dumps is performed here. A dump is allowed to continue
+                * if at least half the ring is unused.
+                */
+               while (nlk->cb != NULL && netlink_dump_space(nlk)) {
+                       err = netlink_dump(sk);
+                       if (err < 0) {
+                               sk->sk_err = err;
+                               sk->sk_error_report(sk);
+                               break;
+                       }
+               }
+               netlink_rcv_wake(sk);
+       }
 
        mask = datagram_poll(file, sock, wait);
 
@@ -483,10 +553,155 @@ static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
        NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
        NETLINK_CB(skb).sk = sk;
 }
+
+/*
+ * Transmit the frames marked NL_MMAP_STATUS_VALID in @sk's TX ring.
+ *
+ * @sk:         sending socket; its tx_ring is walked under nlk->pg_vec_lock
+ * @msg:        only msg_flags (MSG_DONTWAIT) is consulted
+ * @dst_portid: unicast destination port id
+ * @dst_group:  multicast destination group, 0 for none
+ * @siocb:      supplies the sender credentials (siocb->scm->creds)
+ *
+ * When the loop ends normally, returns the accumulated netlink_unicast()
+ * results, or 0 if nothing was sent. Any failure returns immediately, even
+ * if earlier frames were already sent: -EINVAL for a frame whose length
+ * exceeds the frame payload size, -ENOBUFS when an skb cannot be allocated,
+ * or the error from security_netlink_send() / netlink_unicast().
+ */
+static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
+                               u32 dst_portid, u32 dst_group,
+                               struct sock_iocb *siocb)
+{
+       struct netlink_sock *nlk = nlk_sk(sk);
+       struct netlink_ring *ring;
+       struct nl_mmap_hdr *hdr;
+       struct sk_buff *skb;
+       unsigned int maxlen;
+       unsigned int nm_len;
+       bool excl = true;
+       int err = 0, len = 0;
+
+       /* Netlink messages are validated by the receiver before processing.
+        * In order to avoid userspace changing the contents of the message
+        * after validation, the socket and the ring may only be used by a
+        * single process, otherwise we fall back to copying.
+        */
+       if (atomic_long_read(&sk->sk_socket->file->f_count) > 2 ||
+           atomic_read(&nlk->mapped) > 1)
+               excl = false;
+
+       mutex_lock(&nlk->pg_vec_lock);
+
+       ring   = &nlk->tx_ring;
+       maxlen = ring->frame_size - NL_MMAP_HDRLEN;
+
+       do {
+               hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
+               if (hdr == NULL) {
+                       /* Nothing valid at head: a blocking sender yields
+                        * while earlier frames are still pending, then the
+                        * loop condition decides whether to look again.
+                        */
+                       if (!(msg->msg_flags & MSG_DONTWAIT) &&
+                           atomic_read(&nlk->tx_ring.pending))
+                               schedule();
+                       continue;
+               }
+
+               /* The frame header can be rewritten by userspace at any
+                * time. Read the length exactly once and use only that
+                * snapshot below, so the value that passed the bounds check
+                * is the value used for allocation and copying.
+                */
+               nm_len = *(volatile __typeof__(hdr->nm_len) *)&hdr->nm_len;
+               if (nm_len > maxlen) {
+                       err = -EINVAL;
+                       goto out;
+               }
+
+               netlink_frame_flush_dcache(hdr);
+
+               if (likely(dst_portid == 0 && dst_group == 0 && excl)) {
+                       /* Exclusive user sending to the kernel: hand the
+                        * frame itself to the skb instead of copying it.
+                        */
+                       skb = alloc_skb_head(GFP_KERNEL);
+                       if (skb == NULL) {
+                               err = -ENOBUFS;
+                               goto out;
+                       }
+                       /* Reference for NETLINK_CB(skb).sk set up below. */
+                       sock_hold(sk);
+                       netlink_ring_setup_skb(skb, sk, ring, hdr);
+                       NETLINK_CB(skb).flags |= NETLINK_SKB_TX;
+                       __skb_put(skb, nm_len);
+                       netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
+                       atomic_inc(&ring->pending);
+               } else {
+                       /* Copy path: size, put and memcpy all use the single
+                        * validated snapshot of the frame length.
+                        */
+                       skb = alloc_skb(nm_len, GFP_KERNEL);
+                       if (skb == NULL) {
+                               err = -ENOBUFS;
+                               goto out;
+                       }
+                       __skb_put(skb, nm_len);
+                       memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len);
+                       netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
+               }
+
+               netlink_increment_head(ring);
+
+               NETLINK_CB(skb).portid    = nlk->portid;
+               NETLINK_CB(skb).dst_group = dst_group;
+               NETLINK_CB(skb).creds     = siocb->scm->creds;
+
+               err = security_netlink_send(sk, skb);
+               if (err) {
+                       kfree_skb(skb);
+                       goto out;
+               }
+
+               if (unlikely(dst_group)) {
+                       /* Extra reference so the skb can still be unicast
+                        * after the broadcast (which presumably consumes
+                        * one reference).
+                        */
+                       atomic_inc(&skb->users);
+                       netlink_broadcast(sk, skb, dst_portid, dst_group,
+                                         GFP_KERNEL);
+               }
+               err = netlink_unicast(sk, skb, dst_portid,
+                                     msg->msg_flags & MSG_DONTWAIT);
+               if (err < 0)
+                       goto out;
+               len += err;
+
+       } while (hdr != NULL ||
+                (!(msg->msg_flags & MSG_DONTWAIT) &&
+                 atomic_read(&nlk->tx_ring.pending)));
+
+       if (len > 0)
+               err = len;
+out:
+       mutex_unlock(&nlk->pg_vec_lock);
+       return err;
+}
+
+/*
+ * Publish a ring-backed @skb to userspace by completing its frame header.
+ *
+ * The frame header obtained from the skb is filled with the message length,
+ * destination group and sender credentials, then the frame is marked
+ * NL_MMAP_STATUS_VALID. The skb itself is not queued: it is flagged
+ * NETLINK_SKB_DELIVERED and released. @sk is not used in the body.
+ */
+static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
+{
+       struct nl_mmap_hdr *hdr;
+
+       hdr = netlink_mmap_hdr(skb);
+       hdr->nm_len     = skb->len;
+       hdr->nm_group   = NETLINK_CB(skb).dst_group;
+       hdr->nm_pid     = NETLINK_CB(skb).creds.pid;
+       hdr->nm_uid     = NETLINK_CB(skb).creds.uid;
+       hdr->nm_gid     = NETLINK_CB(skb).creds.gid;
+       netlink_frame_flush_dcache(hdr);
+       /* Status is written last, after all header fields are in place. */
+       netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
+
+       /* netlink_skb_destructor() zeroes nm_len for RX skbs lacking this
+        * flag, so it must be set before the skb is released.
+        */
+       NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED;
+       kfree_skb(skb);
+}
+
+/*
+ * Deliver a regular (non ring-backed) @skb to a receiver that has an RX ring.
+ *
+ * The skb is appended to @sk's receive queue and the unused frame at the
+ * ring head is filled with the message metadata and marked
+ * NL_MMAP_STATUS_COPY. If no unused frame is available the skb is freed and
+ * the drop is accounted through netlink_overrun().
+ */
+static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
+{
+       struct netlink_sock *nlk = nlk_sk(sk);
+       struct netlink_ring *ring = &nlk->rx_ring;
+       struct nl_mmap_hdr *hdr;
+
+       spin_lock_bh(&sk->sk_receive_queue.lock);
+       hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
+       if (hdr == NULL) {
+               spin_unlock_bh(&sk->sk_receive_queue.lock);
+               kfree_skb(skb);
+               netlink_overrun(sk);
+               return;
+       }
+
+       /* Fill the header while this path still owns the skb. Once the skb
+        * is on the receive queue and the lock is dropped, a consumer may
+        * dequeue and free it, so it must not be dereferenced afterwards.
+        */
+       hdr->nm_len     = skb->len;
+       hdr->nm_group   = NETLINK_CB(skb).dst_group;
+       hdr->nm_pid     = NETLINK_CB(skb).creds.pid;
+       hdr->nm_uid     = NETLINK_CB(skb).creds.uid;
+       hdr->nm_gid     = NETLINK_CB(skb).creds.gid;
+
+       netlink_increment_head(ring);
+       __skb_queue_tail(&sk->sk_receive_queue, skb);
+       spin_unlock_bh(&sk->sk_receive_queue.lock);
+
+       /* Status is written only after the skb is queued; touches hdr only. */
+       netlink_set_status(hdr, NL_MMAP_STATUS_COPY);
+}
+
 #else /* CONFIG_NETLINK_MMAP */
 #define netlink_skb_is_mmaped(skb)     false
+#define netlink_rx_is_mmaped(sk)       false
+#define netlink_tx_is_mmaped(sk)       false
 #define netlink_mmap                   sock_no_mmap
 #define netlink_poll                   datagram_poll
+#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb)    0
 #endif /* CONFIG_NETLINK_MMAP */
 
 static void netlink_destroy_callback(struct netlink_callback *cb)
@@ -517,11 +732,16 @@ static void netlink_skb_destructor(struct sk_buff *skb)
                hdr = netlink_mmap_hdr(skb);
                sk = NETLINK_CB(skb).sk;
 
-               if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
-                       hdr->nm_len = 0;
-                       netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
+               if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
+                       netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
+                       ring = &nlk_sk(sk)->tx_ring;
+               } else {
+                       if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
+                               hdr->nm_len = 0;
+                               netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
+                       }
+                       ring = &nlk_sk(sk)->rx_ring;
                }
-               ring = &nlk_sk(sk)->rx_ring;
 
                WARN_ON(atomic_read(&ring->pending) == 0);
                atomic_dec(&ring->pending);
@@ -1165,19 +1385,6 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr,
        return 0;
 }
 
-static void netlink_overrun(struct sock *sk)
-{
-       struct netlink_sock *nlk = nlk_sk(sk);
-
-       if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) {
-               if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) {
-                       sk->sk_err = ENOBUFS;
-                       sk->sk_error_report(sk);
-               }
-       }
-       atomic_inc(&sk->sk_drops);
-}
-
 static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
 {
        struct sock *sock;
@@ -1230,8 +1437,9 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 
        nlk = nlk_sk(sk);
 
-       if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-           test_bit(NETLINK_CONGESTED, &nlk->state)) {
+       if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+            test_bit(NETLINK_CONGESTED, &nlk->state)) &&
+           !netlink_skb_is_mmaped(skb)) {
                DECLARE_WAITQUEUE(wait, current);
                if (!*timeo) {
                        if (!ssk || netlink_is_kernel(ssk))
@@ -1267,7 +1475,14 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
 {
        int len = skb->len;
 
-       skb_queue_tail(&sk->sk_receive_queue, skb);
+#ifdef CONFIG_NETLINK_MMAP
+       if (netlink_skb_is_mmaped(skb))
+               netlink_queue_mmaped_skb(sk, skb);
+       else if (netlink_rx_is_mmaped(sk))
+               netlink_ring_set_copied(sk, skb);
+       else
+#endif /* CONFIG_NETLINK_MMAP */
+               skb_queue_tail(&sk->sk_receive_queue, skb);
        sk->sk_data_ready(sk, len);
        return len;
 }
@@ -1291,6 +1506,8 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
        int delta;
 
        WARN_ON(skb->sk != NULL);
+       if (netlink_skb_is_mmaped(skb))
+               return skb;
 
        delta = skb->end - skb->tail;
        if (delta * 2 < skb->truesize)
@@ -1310,16 +1527,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
        return skb;
 }
 
-static void netlink_rcv_wake(struct sock *sk)
-{
-       struct netlink_sock *nlk = nlk_sk(sk);
-
-       if (skb_queue_empty(&sk->sk_receive_queue))
-               clear_bit(NETLINK_CONGESTED, &nlk->state);
-       if (!test_bit(NETLINK_CONGESTED, &nlk->state))
-               wake_up_interruptible(&nlk->wait);
-}
-
 static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
                                  struct sock *ssk)
 {
@@ -1376,6 +1583,69 @@ retry:
 }
 EXPORT_SYMBOL(netlink_unicast);
 
+/*
+ * Allocate an skb for a netlink message of @size bytes from @ssk to
+ * @dst_portid.
+ *
+ * With CONFIG_NETLINK_MMAP, the destination socket is looked up and, if it
+ * has an RX ring whose frames can hold @size bytes, an skb head is attached
+ * to the next unused ring frame. That frame is marked
+ * NL_MMAP_STATUS_RESERVED, ring->pending is incremented and the ring head
+ * is advanced.
+ *
+ * Falls back to alloc_skb(@size, @gfp_mask) when the lookup fails, the
+ * receiver has no RX ring, or a frame is too small for @size.
+ *
+ * Returns NULL when the skb head cannot be allocated, or when the ring has
+ * no unused frame (also accounted as an overrun on the receiver).
+ */
+struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
+                                 u32 dst_portid, gfp_t gfp_mask)
+{
+#ifdef CONFIG_NETLINK_MMAP
+       struct sock *sk = NULL;
+       struct sk_buff *skb;
+       struct netlink_ring *ring;
+       struct nl_mmap_hdr *hdr;
+       unsigned int maxlen;
+
+       sk = netlink_getsockbyportid(ssk, dst_portid);
+       if (IS_ERR(sk))
+               goto out;
+
+       ring = &nlk_sk(sk)->rx_ring;
+       /* fast-path without atomic ops for common case: non-mmaped receiver */
+       if (ring->pg_vec == NULL)
+               goto out_put;
+
+       skb = alloc_skb_head(gfp_mask);
+       if (skb == NULL)
+               goto err1;
+
+       spin_lock_bh(&sk->sk_receive_queue.lock);
+       /* check again under lock */
+       if (ring->pg_vec == NULL)
+               goto out_free;
+
+       /* A frame carries at most frame_size minus the header length. */
+       maxlen = ring->frame_size - NL_MMAP_HDRLEN;
+       if (maxlen < size)
+               goto out_free;
+
+       netlink_forward_ring(ring);
+       hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
+       if (hdr == NULL)
+               goto err2;
+       netlink_ring_setup_skb(skb, sk, ring, hdr);
+       netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
+       atomic_inc(&ring->pending);
+       netlink_increment_head(ring);
+
+       spin_unlock_bh(&sk->sk_receive_queue.lock);
+       /* NOTE(review): the socket reference from the lookup is not dropped
+        * on this path; presumably it is kept for NETLINK_CB(skb).sk and
+        * released by the skb destructor -- confirm.
+        */
+       return skb;
+
+       /* Hard failures: no skb is returned. */
+err2:
+       kfree_skb(skb);
+       spin_unlock_bh(&sk->sk_receive_queue.lock);
+       netlink_overrun(sk);
+err1:
+       sock_put(sk);
+       return NULL;
+
+       /* Soft failures: undo and fall through to a regular allocation. */
+out_free:
+       kfree_skb(skb);
+       spin_unlock_bh(&sk->sk_receive_queue.lock);
+out_put:
+       sock_put(sk);
+out:
+#endif
+       return alloc_skb(size, gfp_mask);
+}
+EXPORT_SYMBOL_GPL(netlink_alloc_skb);
+
 int netlink_has_listeners(struct sock *sk, unsigned int group)
 {
        int res = 0;
@@ -1815,6 +2085,13 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
                        goto out;
        }
 
+       if (netlink_tx_is_mmaped(sk) &&
+           msg->msg_iov->iov_base == NULL) {
+               err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
+                                          siocb);
+               goto out;
+       }
+
        err = -EMSGSIZE;
        if (len > sk->sk_sndbuf - 32)
                goto out;
@@ -2147,9 +2424,13 @@ static int netlink_dump(struct sock *sk)
 
        alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);
 
-       skb = sock_rmalloc(sk, alloc_size, 0, GFP_KERNEL);
+       if (!netlink_rx_is_mmaped(sk) &&
+           atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+               goto errout_skb;
+       skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, GFP_KERNEL);
        if (!skb)
                goto errout_skb;
+       netlink_skb_set_owner_r(skb, sk);
 
        len = cb->dump(skb, cb);
 
@@ -2204,6 +2485,19 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
        if (cb == NULL)
                return -ENOBUFS;
 
+       /* Memory mapped dump requests need to be copied to avoid looping
+        * on the pending state in netlink_mmap_sendmsg() while the CB holds
+        * a reference to the skb.
+        */
+       if (netlink_skb_is_mmaped(skb)) {
+               skb = skb_copy(skb, GFP_KERNEL);
+               if (skb == NULL) {
+                       kfree(cb);
+                       return -ENOBUFS;
+               }
+       } else
+               atomic_inc(&skb->users);
+
        cb->dump = control->dump;
        cb->done = control->done;
        cb->nlh = nlh;
@@ -2264,7 +2558,8 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
        if (err)
                payload += nlmsg_len(nlh);
 
-       skb = nlmsg_new(payload, GFP_KERNEL);
+       skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload),
+                               NETLINK_CB(in_skb).portid, GFP_KERNEL);
        if (!skb) {
                struct sock *sk;