summaryrefslogtreecommitdiff
path: root/net/unix
diff options
context:
space:
mode:
Diffstat (limited to 'net/unix')
-rw-r--r--net/unix/Kconfig5
-rw-r--r--net/unix/Makefile2
-rw-r--r--net/unix/af_unix.c151
-rw-r--r--net/unix/garbage.c68
-rw-r--r--net/unix/scm.c161
-rw-r--r--net/unix/scm.h10
6 files changed, 254 insertions, 143 deletions
diff --git a/net/unix/Kconfig b/net/unix/Kconfig
index 8b31ab85d050..3b9e450656a4 100644
--- a/net/unix/Kconfig
+++ b/net/unix/Kconfig
@@ -19,6 +19,11 @@ config UNIX
Say Y unless you know what you are doing.
+config UNIX_SCM
+ bool
+ depends on UNIX
+ default y
+
config UNIX_DIAG
tristate "UNIX: socket monitoring interface"
depends on UNIX
diff --git a/net/unix/Makefile b/net/unix/Makefile
index b663c607b1c6..dc686c6757fb 100644
--- a/net/unix/Makefile
+++ b/net/unix/Makefile
@@ -9,3 +9,5 @@ unix-$(CONFIG_SYSCTL) += sysctl_net_unix.o
obj-$(CONFIG_UNIX_DIAG) += unix_diag.o
unix_diag-y := diag.o
+
+obj-$(CONFIG_UNIX_SCM) += scm.o
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 33948cc03ba6..242d170991b4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -118,6 +118,8 @@
#include <linux/security.h>
#include <linux/freezer.h>
+#include "scm.h"
+
struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
@@ -592,20 +594,42 @@ static void unix_release_sock(struct sock *sk, int embrion)
static void init_peercred(struct sock *sk)
{
- put_pid(sk->sk_peer_pid);
- if (sk->sk_peer_cred)
- put_cred(sk->sk_peer_cred);
+ const struct cred *old_cred;
+ struct pid *old_pid;
+
+ spin_lock(&sk->sk_peer_lock);
+ old_pid = sk->sk_peer_pid;
+ old_cred = sk->sk_peer_cred;
sk->sk_peer_pid = get_pid(task_tgid(current));
sk->sk_peer_cred = get_current_cred();
+ spin_unlock(&sk->sk_peer_lock);
+
+ put_pid(old_pid);
+ put_cred(old_cred);
}
static void copy_peercred(struct sock *sk, struct sock *peersk)
{
- put_pid(sk->sk_peer_pid);
- if (sk->sk_peer_cred)
- put_cred(sk->sk_peer_cred);
+ const struct cred *old_cred;
+ struct pid *old_pid;
+
+ if (sk < peersk) {
+ spin_lock(&sk->sk_peer_lock);
+ spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock(&peersk->sk_peer_lock);
+ spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
+ }
+ old_pid = sk->sk_peer_pid;
+ old_cred = sk->sk_peer_cred;
sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
+
+ spin_unlock(&sk->sk_peer_lock);
+ spin_unlock(&peersk->sk_peer_lock);
+
+ put_pid(old_pid);
+ put_cred(old_cred);
}
static int unix_listen(struct socket *sock, int backlog)
@@ -1504,78 +1528,51 @@ out:
return err;
}
-static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
-{
- int i;
-
- scm->fp = UNIXCB(skb).fp;
- UNIXCB(skb).fp = NULL;
-
- for (i = scm->fp->count-1; i >= 0; i--)
- unix_notinflight(scm->fp->user, scm->fp->fp[i]);
-}
-
-static void unix_destruct_scm(struct sk_buff *skb)
+static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
- struct scm_cookie scm;
- memset(&scm, 0, sizeof(scm));
- scm.pid = UNIXCB(skb).pid;
- if (UNIXCB(skb).fp)
- unix_detach_fds(&scm, skb);
-
- /* Alas, it calls VFS */
- /* So fscking what? fput() had been SMP-safe since the last Summer */
- scm_destroy(&scm);
- sock_wfree(skb);
-}
-
-/*
- * The "user->unix_inflight" variable is protected by the garbage
- * collection lock, and we just read it locklessly here. If you go
- * over the limit, there might be a tiny race in actually noticing
- * it across threads. Tough.
- */
-static inline bool too_many_unix_fds(struct task_struct *p)
-{
- struct user_struct *user = current_user();
-
- if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
- return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
- return false;
-}
-
-#define MAX_RECURSION_LEVEL 4
-
-static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
-{
- int i;
- unsigned char max_level = 0;
-
- if (too_many_unix_fds(current))
- return -ETOOMANYREFS;
-
- for (i = scm->fp->count - 1; i >= 0; i--) {
- struct sock *sk = unix_get_socket(scm->fp->fp[i]);
-
- if (sk)
- max_level = max(max_level,
- unix_sk(sk)->recursion_level);
- }
- if (unlikely(max_level > MAX_RECURSION_LEVEL))
- return -ETOOMANYREFS;
+ scm->fp = scm_fp_dup(UNIXCB(skb).fp);
/*
- * Need to duplicate file references for the sake of garbage
- * collection. Otherwise a socket in the fps might become a
- * candidate for GC while the skb is not yet queued.
+ * Garbage collection of unix sockets starts by selecting a set of
+ * candidate sockets which have reference only from being in flight
+ * (total_refs == inflight_refs). This condition is checked once during
+ * the candidate collection phase, and candidates are marked as such, so
+ * that non-candidates can later be ignored. While inflight_refs is
+ * protected by unix_gc_lock, total_refs (file count) is not, hence this
+ * is an instantaneous decision.
+ *
+ * Once a candidate, however, the socket must not be reinstalled into a
+ * file descriptor while the garbage collection is in progress.
+ *
+ * If the above conditions are met, then the directed graph of
+ * candidates (*) does not change while unix_gc_lock is held.
+ *
+ * Any operations that changes the file count through file descriptors
+ * (dup, close, sendmsg) does not change the graph since candidates are
+ * not installed in fds.
+ *
+ * Dequeing a candidate via recvmsg would install it into an fd, but
+ * that takes unix_gc_lock to decrement the inflight count, so it's
+ * serialized with garbage collection.
+ *
+ * MSG_PEEK is special in that it does not change the inflight count,
+ * yet does install the socket into an fd. The following lock/unlock
+ * pair is to ensure serialization with garbage collection. It must be
+ * done between incrementing the file count and installing the file into
+ * an fd.
+ *
+ * If garbage collection starts after the barrier provided by the
+ * lock/unlock, then it will see the elevated refcount and not mark this
+ * as a candidate. If a garbage collection is already in progress
+ * before the file count was incremented, then the lock/unlock pair will
+ * ensure that garbage collection is finished before progressing to
+ * installing the fd.
+ *
+ * (*) A -> B where B is on the queue of A or B is on the queue of C
+ * which is on the queue of listening socket A.
*/
- UNIXCB(skb).fp = scm_fp_dup(scm->fp);
- if (!UNIXCB(skb).fp)
- return -ENOMEM;
-
- for (i = scm->fp->count - 1; i >= 0; i--)
- unix_inflight(scm->fp->user, scm->fp->fp[i]);
- return max_level;
+ spin_lock(&unix_gc_lock);
+ spin_unlock(&unix_gc_lock);
}
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
@@ -2203,7 +2200,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
sk_peek_offset_fwd(sk, size);
if (UNIXCB(skb).fp)
- scm.fp = scm_fp_dup(UNIXCB(skb).fp);
+ unix_peek_fds(&scm, skb);
}
err = (flags & MSG_TRUNC) ? skb->len - skip : size;
@@ -2448,7 +2445,7 @@ unlock:
/* It is questionable, see note in unix_dgram_recvmsg.
*/
if (UNIXCB(skb).fp)
- scm.fp = scm_fp_dup(UNIXCB(skb).fp);
+ unix_peek_fds(&scm, skb);
sk_peek_offset_fwd(sk, chunk);
@@ -2725,7 +2722,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
other = unix_peer(sk);
if (other && unix_peer(other) != sk &&
- unix_recvq_full(other) &&
+ unix_recvq_full_lockless(other) &&
unix_dgram_peer_wake_me(sk, other))
writable = 0;
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index c36757e72844..8bbe1b8e4ff7 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -86,77 +86,13 @@
#include <net/scm.h>
#include <net/tcp_states.h>
+#include "scm.h"
+
/* Internal data structures and random procedures: */
-static LIST_HEAD(gc_inflight_list);
static LIST_HEAD(gc_candidates);
-static DEFINE_SPINLOCK(unix_gc_lock);
static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait);
-unsigned int unix_tot_inflight;
-
-struct sock *unix_get_socket(struct file *filp)
-{
- struct sock *u_sock = NULL;
- struct inode *inode = file_inode(filp);
-
- /* Socket ? */
- if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
- struct socket *sock = SOCKET_I(inode);
- struct sock *s = sock->sk;
-
- /* PF_UNIX ? */
- if (s && sock->ops && sock->ops->family == PF_UNIX)
- u_sock = s;
- }
- return u_sock;
-}
-
-/* Keep the number of times in flight count for the file
- * descriptor if it is for an AF_UNIX socket.
- */
-
-void unix_inflight(struct user_struct *user, struct file *fp)
-{
- struct sock *s = unix_get_socket(fp);
-
- spin_lock(&unix_gc_lock);
-
- if (s) {
- struct unix_sock *u = unix_sk(s);
-
- if (atomic_long_inc_return(&u->inflight) == 1) {
- BUG_ON(!list_empty(&u->link));
- list_add_tail(&u->link, &gc_inflight_list);
- } else {
- BUG_ON(list_empty(&u->link));
- }
- unix_tot_inflight++;
- }
- user->unix_inflight++;
- spin_unlock(&unix_gc_lock);
-}
-
-void unix_notinflight(struct user_struct *user, struct file *fp)
-{
- struct sock *s = unix_get_socket(fp);
-
- spin_lock(&unix_gc_lock);
-
- if (s) {
- struct unix_sock *u = unix_sk(s);
-
- BUG_ON(!atomic_long_read(&u->inflight));
- BUG_ON(list_empty(&u->link));
-
- if (atomic_long_dec_and_test(&u->inflight))
- list_del_init(&u->link);
- unix_tot_inflight--;
- }
- user->unix_inflight--;
- spin_unlock(&unix_gc_lock);
-}
-
static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
struct sk_buff_head *hitlist)
{
diff --git a/net/unix/scm.c b/net/unix/scm.c
new file mode 100644
index 000000000000..df8f636ab1d8
--- /dev/null
+++ b/net/unix/scm.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/fs.h>
+#include <net/af_unix.h>
+#include <net/scm.h>
+#include <linux/init.h>
+
+#include "scm.h"
+
+unsigned int unix_tot_inflight;
+EXPORT_SYMBOL(unix_tot_inflight);
+
+LIST_HEAD(gc_inflight_list);
+EXPORT_SYMBOL(gc_inflight_list);
+
+DEFINE_SPINLOCK(unix_gc_lock);
+EXPORT_SYMBOL(unix_gc_lock);
+
+struct sock *unix_get_socket(struct file *filp)
+{
+ struct sock *u_sock = NULL;
+ struct inode *inode = file_inode(filp);
+
+ /* Socket ? */
+ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
+ struct socket *sock = SOCKET_I(inode);
+ struct sock *s = sock->sk;
+
+ /* PF_UNIX ? */
+ if (s && sock->ops && sock->ops->family == PF_UNIX)
+ u_sock = s;
+ }
+ return u_sock;
+}
+EXPORT_SYMBOL(unix_get_socket);
+
+/* Keep the number of times in flight count for the file
+ * descriptor if it is for an AF_UNIX socket.
+ */
+void unix_inflight(struct user_struct *user, struct file *fp)
+{
+ struct sock *s = unix_get_socket(fp);
+
+ spin_lock(&unix_gc_lock);
+
+ if (s) {
+ struct unix_sock *u = unix_sk(s);
+
+ if (atomic_long_inc_return(&u->inflight) == 1) {
+ BUG_ON(!list_empty(&u->link));
+ list_add_tail(&u->link, &gc_inflight_list);
+ } else {
+ BUG_ON(list_empty(&u->link));
+ }
+ unix_tot_inflight++;
+ }
+ user->unix_inflight++;
+ spin_unlock(&unix_gc_lock);
+}
+
+void unix_notinflight(struct user_struct *user, struct file *fp)
+{
+ struct sock *s = unix_get_socket(fp);
+
+ spin_lock(&unix_gc_lock);
+
+ if (s) {
+ struct unix_sock *u = unix_sk(s);
+
+ BUG_ON(!atomic_long_read(&u->inflight));
+ BUG_ON(list_empty(&u->link));
+
+ if (atomic_long_dec_and_test(&u->inflight))
+ list_del_init(&u->link);
+ unix_tot_inflight--;
+ }
+ user->unix_inflight--;
+ spin_unlock(&unix_gc_lock);
+}
+
+/*
+ * The "user->unix_inflight" variable is protected by the garbage
+ * collection lock, and we just read it locklessly here. If you go
+ * over the limit, there might be a tiny race in actually noticing
+ * it across threads. Tough.
+ */
+static inline bool too_many_unix_fds(struct task_struct *p)
+{
+ struct user_struct *user = current_user();
+
+ if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
+ return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
+ return false;
+}
+
+#define MAX_RECURSION_LEVEL 4
+
+int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+{
+ int i;
+ unsigned char max_level = 0;
+
+ if (too_many_unix_fds(current))
+ return -ETOOMANYREFS;
+
+ for (i = scm->fp->count - 1; i >= 0; i--) {
+ struct sock *sk = unix_get_socket(scm->fp->fp[i]);
+
+ if (sk)
+ max_level = max(max_level,
+ unix_sk(sk)->recursion_level);
+ }
+ if (unlikely(max_level > MAX_RECURSION_LEVEL))
+ return -ETOOMANYREFS;
+
+ /*
+ * Need to duplicate file references for the sake of garbage
+ * collection. Otherwise a socket in the fps might become a
+ * candidate for GC while the skb is not yet queued.
+ */
+ UNIXCB(skb).fp = scm_fp_dup(scm->fp);
+ if (!UNIXCB(skb).fp)
+ return -ENOMEM;
+
+ for (i = scm->fp->count - 1; i >= 0; i--)
+ unix_inflight(scm->fp->user, scm->fp->fp[i]);
+ return max_level;
+}
+EXPORT_SYMBOL(unix_attach_fds);
+
+void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+{
+ int i;
+
+ scm->fp = UNIXCB(skb).fp;
+ UNIXCB(skb).fp = NULL;
+
+ for (i = scm->fp->count-1; i >= 0; i--)
+ unix_notinflight(scm->fp->user, scm->fp->fp[i]);
+}
+EXPORT_SYMBOL(unix_detach_fds);
+
+void unix_destruct_scm(struct sk_buff *skb)
+{
+ struct scm_cookie scm;
+
+ memset(&scm, 0, sizeof(scm));
+ scm.pid = UNIXCB(skb).pid;
+ if (UNIXCB(skb).fp)
+ unix_detach_fds(&scm, skb);
+
+ /* Alas, it calls VFS */
+ /* So fscking what? fput() had been SMP-safe since the last Summer */
+ scm_destroy(&scm);
+ sock_wfree(skb);
+}
+EXPORT_SYMBOL(unix_destruct_scm);
diff --git a/net/unix/scm.h b/net/unix/scm.h
new file mode 100644
index 000000000000..5a255a477f16
--- /dev/null
+++ b/net/unix/scm.h
@@ -0,0 +1,10 @@
+#ifndef NET_UNIX_SCM_H
+#define NET_UNIX_SCM_H
+
+extern struct list_head gc_inflight_list;
+extern spinlock_t unix_gc_lock;
+
+int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb);
+void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb);
+
+#endif