diff options
Diffstat (limited to 'net/unix/af_unix.c')
| -rw-r--r-- | net/unix/af_unix.c | 500 | 
1 files changed, 287 insertions, 213 deletions
| diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 9a6ad5974dff..0be0dcb07f7b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -126,6 +126,81 @@ static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];   *    hash table is protected with spinlock.   *    each socket state is protected by separate spinlock.   */ +#ifdef CONFIG_PROVE_LOCKING +#define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r))) + +static int unix_table_lock_cmp_fn(const struct lockdep_map *a, +				  const struct lockdep_map *b) +{ +	return cmp_ptr(a, b); +} + +static int unix_state_lock_cmp_fn(const struct lockdep_map *_a, +				  const struct lockdep_map *_b) +{ +	const struct unix_sock *a, *b; + +	a = container_of(_a, struct unix_sock, lock.dep_map); +	b = container_of(_b, struct unix_sock, lock.dep_map); + +	if (a->sk.sk_state == TCP_LISTEN) { +		/* unix_stream_connect(): Before the 2nd unix_state_lock(), +		 * +		 *   1. a is TCP_LISTEN. +		 *   2. b is not a. +		 *   3. concurrent connect(b -> a) must fail. +		 * +		 * Except for 2. & 3., the b's state can be any possible +		 * value due to concurrent connect() or listen(). +		 * +		 * 2. is detected in debug_spin_lock_before(), and 3. cannot +		 * be expressed as lock_cmp_fn. +		 */ +		switch (b->sk.sk_state) { +		case TCP_CLOSE: +		case TCP_ESTABLISHED: +		case TCP_LISTEN: +			return -1; +		default: +			/* Invalid case. */ +			return 0; +		} +	} + +	/* Should never happen.  Just to be symmetric. */ +	if (b->sk.sk_state == TCP_LISTEN) { +		switch (b->sk.sk_state) { +		case TCP_CLOSE: +		case TCP_ESTABLISHED: +			return 1; +		default: +			return 0; +		} +	} + +	/* unix_state_double_lock(): ascending address order. */ +	return cmp_ptr(a, b); +} + +static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a, +				  const struct lockdep_map *_b) +{ +	const struct sock *a, *b; + +	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map); +	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map); + +	/* unix_collect_skb(): listener -> embryo order. */ +	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a) +		return -1; + +	/* Should never happen.  Just to be symmetric. */ +	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b) +		return 1; + +	return 0; +} +#endif  static unsigned int unix_unbound_hash(struct sock *sk)  { @@ -168,7 +243,7 @@ static void unix_table_double_lock(struct net *net,  		swap(hash1, hash2);  	spin_lock(&net->unx.table.locks[hash1]); -	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING); +	spin_lock(&net->unx.table.locks[hash2]);  }  static void unix_table_double_unlock(struct net *net, @@ -221,15 +296,9 @@ static inline int unix_may_send(struct sock *sk, struct sock *osk)  	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);  } -static inline int unix_recvq_full(const struct sock *sk) -{ -	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; -} -  static inline int unix_recvq_full_lockless(const struct sock *sk)  { -	return skb_queue_len_lockless(&sk->sk_receive_queue) > -		READ_ONCE(sk->sk_max_ack_backlog); +	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;  }  struct sock *unix_peer_get(struct sock *s) @@ -530,10 +599,10 @@ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)  	return 0;  } -static int unix_writable(const struct sock *sk) +static int unix_writable(const struct sock *sk, unsigned char state)  { -	return sk->sk_state != TCP_LISTEN && -	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; +	return state != TCP_LISTEN && +		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);  }  static void unix_write_space(struct sock *sk) @@ -541,12 +610,12 @@ static void unix_write_space(struct sock *sk)  	struct socket_wq *wq;  	rcu_read_lock(); -	if (unix_writable(sk)) { +	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {  		wq = rcu_dereference(sk->sk_wq);  		if (skwq_has_sleeper(wq))  			wake_up_interruptible_sync_poll(&wq->wait,  				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); -		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); +		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);  	}  	rcu_read_unlock();  } @@ -570,7 +639,6 @@ static void unix_dgram_disconnected(struct sock *sk, struct sock *other)  			sk_error_report(other);  		}  	} -	other->sk_state = TCP_CLOSE;  }  static void unix_sock_destructor(struct sock *sk) @@ -617,7 +685,7 @@ static void unix_release_sock(struct sock *sk, int embrion)  	u->path.dentry = NULL;  	u->path.mnt = NULL;  	state = sk->sk_state; -	sk->sk_state = TCP_CLOSE; +	WRITE_ONCE(sk->sk_state, TCP_CLOSE);  	skpair = unix_peer(sk);  	unix_peer(sk) = NULL; @@ -638,7 +706,7 @@ static void unix_release_sock(struct sock *sk, int embrion)  			unix_state_lock(skpair);  			/* No more writes */  			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); -			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) +			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)  				WRITE_ONCE(skpair->sk_err, ECONNRESET);  			unix_state_unlock(skpair);  			skpair->sk_state_change(skpair); @@ -654,8 +722,8 @@ static void unix_release_sock(struct sock *sk, int embrion)  	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {  		if (state == TCP_LISTEN)  			unix_release_sock(skb->sk, 1); +  		/* passed fds are erased in the kfree_skb hook	      */ -		UNIXCB(skb).consumed = skb->len;  		kfree_skb(skb);  	} @@ -683,14 +751,19 @@ static void unix_release_sock(struct sock *sk, int embrion)  static void init_peercred(struct sock *sk)  { +	sk->sk_peer_pid = get_pid(task_tgid(current)); +	sk->sk_peer_cred = get_current_cred(); +} + +static void update_peercred(struct sock *sk) +{  	const struct cred *old_cred;  	struct pid *old_pid;  	spin_lock(&sk->sk_peer_lock);  	old_pid = sk->sk_peer_pid;  	old_cred = sk->sk_peer_cred; -	sk->sk_peer_pid  = get_pid(task_tgid(current)); -	sk->sk_peer_cred = get_current_cred(); +	init_peercred(sk);  	spin_unlock(&sk->sk_peer_lock);  	put_pid(old_pid); @@ -699,26 +772,12 @@ static void init_peercred(struct sock *sk)  static void copy_peercred(struct sock *sk, struct sock *peersk)  { -	const struct cred *old_cred; -	struct pid *old_pid; +	lockdep_assert_held(&unix_sk(peersk)->lock); -	if (sk < peersk) { -		spin_lock(&sk->sk_peer_lock); -		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); -	} else { -		spin_lock(&peersk->sk_peer_lock); -		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); -	} -	old_pid = sk->sk_peer_pid; -	old_cred = sk->sk_peer_cred; -	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid); +	spin_lock(&sk->sk_peer_lock); +	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);  	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); -  	spin_unlock(&sk->sk_peer_lock); -	spin_unlock(&peersk->sk_peer_lock); - -	put_pid(old_pid); -	put_cred(old_cred);  }  static int unix_listen(struct socket *sock, int backlog) @@ -731,7 +790,7 @@ static int unix_listen(struct socket *sock, int backlog)  	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)  		goto out;	/* Only stream/seqpacket sockets accept */  	err = -EINVAL; -	if (!u->addr) +	if (!READ_ONCE(u->addr))  		goto out;	/* No listens on an unbound socket */  	unix_state_lock(sk);  	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) @@ -739,9 +798,10 @@ static int unix_listen(struct socket *sock, int backlog)  	if (backlog > sk->sk_max_ack_backlog)  		wake_up_interruptible_all(&u->peer_wait);  	sk->sk_max_ack_backlog	= backlog; -	sk->sk_state		= TCP_LISTEN; +	WRITE_ONCE(sk->sk_state, TCP_LISTEN); +  	/* set credentials so connect can copy them */ -	init_peercred(sk); +	update_peercred(sk);  	err = 0;  out_unlock: @@ -755,7 +815,7 @@ static int unix_bind(struct socket *, struct sockaddr *, int);  static int unix_stream_connect(struct socket *, struct sockaddr *,  			       int addr_len, int flags);  static int unix_socketpair(struct socket *, struct socket *); -static int unix_accept(struct socket *, struct socket *, int, bool); +static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);  static int unix_getname(struct socket *, struct sockaddr *, int);  static __poll_t unix_poll(struct file *, struct socket *, poll_table *);  static __poll_t unix_dgram_poll(struct file *, struct socket *, @@ -976,14 +1036,17 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,  	sk->sk_hash		= unix_unbound_hash(sk);  	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;  	sk->sk_write_space	= unix_write_space; -	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen; +	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);  	sk->sk_destruct		= unix_sock_destructor; +	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL); +  	u = unix_sk(sk); -	u->inflight = 0; +	u->listener = NULL; +	u->vertex = NULL;  	u->path.dentry = NULL;  	u->path.mnt = NULL;  	spin_lock_init(&u->lock); -	INIT_LIST_HEAD(&u->link); +	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);  	mutex_init(&u->iolock); /* single task reading lock */  	mutex_init(&u->bindlock); /* single task binding lock */  	init_waitqueue_head(&u->peer_wait); @@ -1131,8 +1194,8 @@ static struct sock *unix_find_other(struct net *net,  static int unix_autobind(struct sock *sk)  { -	unsigned int new_hash, old_hash = sk->sk_hash;  	struct unix_sock *u = unix_sk(sk); +	unsigned int new_hash, old_hash;  	struct net *net = sock_net(sk);  	struct unix_address *addr;  	u32 lastnum, ordernum; @@ -1155,6 +1218,7 @@ static int unix_autobind(struct sock *sk)  	addr->name->sun_family = AF_UNIX;  	refcount_set(&addr->refcnt, 1); +	old_hash = sk->sk_hash;  	ordernum = get_random_u32();  	lastnum = ordernum & 0xFFFFF;  retry: @@ -1195,8 +1259,8 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,  {  	umode_t mode = S_IFSOCK |  	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); -	unsigned int new_hash, old_hash = sk->sk_hash;  	struct unix_sock *u = unix_sk(sk); +	unsigned int new_hash, old_hash;  	struct net *net = sock_net(sk);  	struct mnt_idmap *idmap;  	struct unix_address *addr; @@ -1234,6 +1298,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,  	if (u->addr)  		goto out_unlock; +	old_hash = sk->sk_hash;  	new_hash = unix_bsd_hash(d_backing_inode(dentry));  	unix_table_double_lock(net, old_hash, new_hash);  	u->path.mnt = mntget(parent.mnt); @@ -1261,8 +1326,8 @@ out:  static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,  			      int addr_len)  { -	unsigned int new_hash, old_hash = sk->sk_hash;  	struct unix_sock *u = unix_sk(sk); +	unsigned int new_hash, old_hash;  	struct net *net = sock_net(sk);  	struct unix_address *addr;  	int err; @@ -1280,6 +1345,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,  		goto out_mutex;  	} +	old_hash = sk->sk_hash;  	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);  	unix_table_double_lock(net, old_hash, new_hash); @@ -1329,11 +1395,12 @@ static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)  		unix_state_lock(sk1);  		return;  	} +  	if (sk1 > sk2)  		swap(sk1, sk2);  	unix_state_lock(sk1); -	unix_state_lock_nested(sk2, U_LOCK_SECOND); +	unix_state_lock(sk2);  }  static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) @@ -1369,7 +1436,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,  		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||  		     test_bit(SOCK_PASSPIDFD, &sock->flags)) && -		    !unix_sk(sk)->addr) { +		    !READ_ONCE(unix_sk(sk)->addr)) {  			err = unix_autobind(sk);  			if (err)  				goto out; @@ -1399,7 +1466,8 @@ restart:  		if (err)  			goto out_unlock; -		sk->sk_state = other->sk_state = TCP_ESTABLISHED; +		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); +		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);  	} else {  		/*  		 *	1003.1g breaking connected state with AF_UNSPEC @@ -1416,13 +1484,20 @@ restart:  		unix_peer(sk) = other;  		if (!other) -			sk->sk_state = TCP_CLOSE; +			WRITE_ONCE(sk->sk_state, TCP_CLOSE);  		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);  		unix_state_double_unlock(sk, other); -		if (other != old_peer) +		if (other != old_peer) {  			unix_dgram_disconnected(sk, old_peer); + +			unix_state_lock(old_peer); +			if (!unix_peer(old_peer)) +				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE); +			unix_state_unlock(old_peer); +		} +  		sock_put(old_peer);  	} else {  		unix_peer(sk) = other; @@ -1468,9 +1543,9 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,  	struct unix_sock *u = unix_sk(sk), *newu, *otheru;  	struct net *net = sock_net(sk);  	struct sk_buff *skb = NULL; +	unsigned char state;  	long timeo;  	int err; -	int st;  	err = unix_validate_addr(sunaddr, addr_len);  	if (err) @@ -1481,7 +1556,8 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,  		goto out;  	if ((test_bit(SOCK_PASSCRED, &sock->flags) || -	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { +	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && +	    !READ_ONCE(u->addr)) {  		err = unix_autobind(sk);  		if (err)  			goto out; @@ -1518,7 +1594,6 @@ restart:  		goto out;  	} -	/* Latch state of peer */  	unix_state_lock(other);  	/* Apparently VFS overslept socket death. Retry. */ @@ -1534,7 +1609,7 @@ restart:  	if (other->sk_shutdown & RCV_SHUTDOWN)  		goto out_unlock; -	if (unix_recvq_full(other)) { +	if (unix_recvq_full_lockless(other)) {  		err = -EAGAIN;  		if (!timeo)  			goto out_unlock; @@ -1548,39 +1623,21 @@ restart:  		goto restart;  	} -	/* Latch our state. - -	   It is tricky place. We need to grab our state lock and cannot -	   drop lock on peer. It is dangerous because deadlock is -	   possible. Connect to self case and simultaneous -	   attempt to connect are eliminated by checking socket -	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we -	   check this before attempt to grab lock. - -	   Well, and we have to recheck the state after socket locked. +	/* self connect and simultaneous connect are eliminated +	 * by rejecting TCP_LISTEN socket to avoid deadlock.  	 */ -	st = sk->sk_state; - -	switch (st) { -	case TCP_CLOSE: -		/* This is ok... continue with connect */ -		break; -	case TCP_ESTABLISHED: -		/* Socket is already connected */ -		err = -EISCONN; -		goto out_unlock; -	default: -		err = -EINVAL; +	state = READ_ONCE(sk->sk_state); +	if (unlikely(state != TCP_CLOSE)) { +		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;  		goto out_unlock;  	} -	unix_state_lock_nested(sk, U_LOCK_SECOND); +	unix_state_lock(sk); -	if (sk->sk_state != st) { +	if (unlikely(sk->sk_state != TCP_CLOSE)) { +		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;  		unix_state_unlock(sk); -		unix_state_unlock(other); -		sock_put(other); -		goto restart; +		goto out_unlock;  	}  	err = security_unix_stream_connect(sk, other, newsk); @@ -1597,6 +1654,7 @@ restart:  	newsk->sk_type		= sk->sk_type;  	init_peercred(newsk);  	newu = unix_sk(newsk); +	newu->listener = other;  	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);  	otheru = unix_sk(other); @@ -1628,7 +1686,7 @@ restart:  	copy_peercred(sk, other);  	sock->state	= SS_CONNECTED; -	sk->sk_state	= TCP_ESTABLISHED; +	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);  	sock_hold(newsk);  	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */ @@ -1688,32 +1746,31 @@ static void unix_sock_inherit_flags(const struct socket *old,  		set_bit(SOCK_PASSSEC, &new->flags);  } -static int unix_accept(struct socket *sock, struct socket *newsock, int flags, -		       bool kern) +static int unix_accept(struct socket *sock, struct socket *newsock, +		       struct proto_accept_arg *arg)  {  	struct sock *sk = sock->sk; -	struct sock *tsk;  	struct sk_buff *skb; -	int err; +	struct sock *tsk; -	err = -EOPNOTSUPP; +	arg->err = -EOPNOTSUPP;  	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)  		goto out; -	err = -EINVAL; -	if (sk->sk_state != TCP_LISTEN) +	arg->err = -EINVAL; +	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)  		goto out;  	/* If socket state is TCP_LISTEN it cannot change (for now...),  	 * so that no locks are necessary.  	 */ -	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, -				&err); +	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, +				&arg->err);  	if (!skb) {  		/* This means receive shutdown. */ -		if (err == 0) -			err = -EINVAL; +		if (arg->err == 0) +			arg->err = -EINVAL;  		goto out;  	} @@ -1723,6 +1780,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,  	/* attach accepted sock to socket */  	unix_state_lock(tsk); +	unix_update_edges(unix_sk(tsk));  	newsock->state = SS_CONNECTED;  	unix_sock_inherit_flags(sock, newsock);  	sock_graft(tsk, newsock); @@ -1730,7 +1788,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,  	return 0;  out: -	return err; +	return arg->err;  } @@ -1789,81 +1847,29 @@ static inline bool too_many_unix_fds(struct task_struct *p)  static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)  { -	int i; -  	if (too_many_unix_fds(current))  		return -ETOOMANYREFS; -	/* Need to duplicate file references for the sake of garbage -	 * collection.  Otherwise a socket in the fps might become a -	 * candidate for GC while the skb is not yet queued. -	 */ -	UNIXCB(skb).fp = scm_fp_dup(scm->fp); -	if (!UNIXCB(skb).fp) -		return -ENOMEM; +	UNIXCB(skb).fp = scm->fp; +	scm->fp = NULL; -	for (i = scm->fp->count - 1; i >= 0; i--) -		unix_inflight(scm->fp->user, scm->fp->fp[i]); +	if (unix_prepare_fpl(UNIXCB(skb).fp)) +		return -ENOMEM;  	return 0;  }  static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)  { -	int i; -  	scm->fp = UNIXCB(skb).fp;  	UNIXCB(skb).fp = NULL; -	for (i = scm->fp->count - 1; i >= 0; i--) -		unix_notinflight(scm->fp->user, scm->fp->fp[i]); +	unix_destroy_fpl(scm->fp);  }  static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)  {  	scm->fp = scm_fp_dup(UNIXCB(skb).fp); - -	/* -	 * Garbage collection of unix sockets starts by selecting a set of -	 * candidate sockets which have reference only from being in flight -	 * (total_refs == inflight_refs).  This condition is checked once during -	 * the candidate collection phase, and candidates are marked as such, so -	 * that non-candidates can later be ignored.  While inflight_refs is -	 * protected by unix_gc_lock, total_refs (file count) is not, hence this -	 * is an instantaneous decision. -	 * -	 * Once a candidate, however, the socket must not be reinstalled into a -	 * file descriptor while the garbage collection is in progress. -	 * -	 * If the above conditions are met, then the directed graph of -	 * candidates (*) does not change while unix_gc_lock is held. -	 * -	 * Any operations that changes the file count through file descriptors -	 * (dup, close, sendmsg) does not change the graph since candidates are -	 * not installed in fds. -	 * -	 * Dequeing a candidate via recvmsg would install it into an fd, but -	 * that takes unix_gc_lock to decrement the inflight count, so it's -	 * serialized with garbage collection. -	 * -	 * MSG_PEEK is special in that it does not change the inflight count, -	 * yet does install the socket into an fd.  The following lock/unlock -	 * pair is to ensure serialization with garbage collection.  It must be -	 * done between incrementing the file count and installing the file into -	 * an fd. -	 * -	 * If garbage collection starts after the barrier provided by the -	 * lock/unlock, then it will see the elevated refcount and not mark this -	 * as a candidate.  If a garbage collection is already in progress -	 * before the file count was incremented, then the lock/unlock pair will -	 * ensure that garbage collection is finished before progressing to -	 * installing the fd. -	 * -	 * (*) A -> B where B is on the queue of A or B is on the queue of C -	 * which is on the queue of listening socket A. -	 */ -	spin_lock(&unix_gc_lock); -	spin_unlock(&unix_gc_lock);  }  static void unix_destruct_scm(struct sk_buff *skb) @@ -1937,8 +1943,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb)  	struct scm_fp_list *fp = UNIXCB(skb).fp;  	struct unix_sock *u = unix_sk(sk); -	if (unlikely(fp && fp->count)) +	if (unlikely(fp && fp->count)) {  		atomic_add(fp->count, &u->scm_stat.nr_fds); +		unix_add_edges(fp, u); +	}  }  static void scm_stat_del(struct sock *sk, struct sk_buff *skb) @@ -1946,8 +1954,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)  	struct scm_fp_list *fp = UNIXCB(skb).fp;  	struct unix_sock *u = unix_sk(sk); -	if (unlikely(fp && fp->count)) +	if (unlikely(fp && fp->count)) {  		atomic_sub(fp->count, &u->scm_stat.nr_fds); +		unix_del_edges(fp); +	}  }  /* @@ -1997,14 +2007,15 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,  	}  	if ((test_bit(SOCK_PASSCRED, &sock->flags) || -	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { +	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && +	    !READ_ONCE(u->addr)) {  		err = unix_autobind(sk);  		if (err)  			goto out;  	}  	err = -EMSGSIZE; -	if (len > sk->sk_sndbuf - 32) +	if (len > READ_ONCE(sk->sk_sndbuf) - 32)  		goto out;  	if (len > SKB_MAX_ALLOC) { @@ -2086,7 +2097,7 @@ restart_locked:  			unix_peer(sk) = NULL;  			unix_dgram_peer_wake_disconnect_wakeup(sk, other); -			sk->sk_state = TCP_CLOSE; +			WRITE_ONCE(sk->sk_state, TCP_CLOSE);  			unix_state_unlock(sk);  			unix_dgram_disconnected(sk, other); @@ -2217,13 +2228,15 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other  	maybe_add_creds(skb, sock, other);  	skb_get(skb); +	scm_stat_add(other, skb); + +	spin_lock(&other->sk_receive_queue.lock);  	if (ousk->oob_skb)  		consume_skb(ousk->oob_skb); -  	WRITE_ONCE(ousk->oob_skb, skb); +	__skb_queue_tail(&other->sk_receive_queue, skb); +	spin_unlock(&other->sk_receive_queue.lock); -	scm_stat_add(other, skb); -	skb_queue_tail(&other->sk_receive_queue, skb);  	sk_send_sigurg(other);  	unix_state_unlock(other);  	other->sk_data_ready(other); @@ -2261,7 +2274,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,  	}  	if (msg->msg_namelen) { -		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; +		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;  		goto out_err;  	} else {  		err = -ENOTCONN; @@ -2270,7 +2283,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,  			goto out_err;  	} -	if (sk->sk_shutdown & SEND_SHUTDOWN) +	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)  		goto pipe_err;  	while (sent < len) { @@ -2282,7 +2295,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,  						   &err, 0);  		} else {  			/* Keep two messages in the pipe so it schedules better */ -			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); +			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);  			/* allow fallback to order-0 allocations */  			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); @@ -2375,7 +2388,7 @@ static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,  	if (err)  		return err; -	if (sk->sk_state != TCP_ESTABLISHED) +	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)  		return -ENOTCONN;  	if (msg->msg_namelen) @@ -2389,7 +2402,7 @@ static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,  {  	struct sock *sk = sock->sk; -	if (sk->sk_state != TCP_ESTABLISHED) +	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)  		return -ENOTCONN;  	return unix_dgram_recvmsg(sock, msg, size, flags); @@ -2614,8 +2627,10 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state)  	mutex_lock(&u->iolock);  	unix_state_lock(sk); +	spin_lock(&sk->sk_receive_queue.lock);  	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { +		spin_unlock(&sk->sk_receive_queue.lock);  		unix_state_unlock(sk);  		mutex_unlock(&u->iolock);  		return -EINVAL; @@ -2627,6 +2642,8 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state)  		WRITE_ONCE(u->oob_skb, NULL);  	else  		skb_get(oob_skb); + +	spin_unlock(&sk->sk_receive_queue.lock);  	unix_state_unlock(sk);  	chunk = state->recv_actor(oob_skb, 0, chunk, state); @@ -2650,29 +2667,53 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,  {  	struct unix_sock *u = unix_sk(sk); -	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) { -		skb_unlink(skb, &sk->sk_receive_queue); -		consume_skb(skb); -		skb = NULL; +	if (!unix_skb_len(skb)) { +		struct sk_buff *unlinked_skb = NULL; + +		spin_lock(&sk->sk_receive_queue.lock); + +		if (copied && (!u->oob_skb || skb == u->oob_skb)) { +			skb = NULL; +		} else if (flags & MSG_PEEK) { +			skb = skb_peek_next(skb, &sk->sk_receive_queue); +		} else { +			unlinked_skb = skb; +			skb = skb_peek_next(skb, &sk->sk_receive_queue); +			__skb_unlink(unlinked_skb, &sk->sk_receive_queue); +		} + +		spin_unlock(&sk->sk_receive_queue.lock); + +		consume_skb(unlinked_skb);  	} else { +		struct sk_buff *unlinked_skb = NULL; + +		spin_lock(&sk->sk_receive_queue.lock); +  		if (skb == u->oob_skb) {  			if (copied) {  				skb = NULL; -			} else if (sock_flag(sk, SOCK_URGINLINE)) { -				if (!(flags & MSG_PEEK)) { +			} else if (!(flags & MSG_PEEK)) { +				if (sock_flag(sk, SOCK_URGINLINE)) {  					WRITE_ONCE(u->oob_skb, NULL);  					consume_skb(skb); +				} else { +					__skb_unlink(skb, &sk->sk_receive_queue); +					WRITE_ONCE(u->oob_skb, NULL); +					unlinked_skb = skb; +					skb = skb_peek(&sk->sk_receive_queue);  				} -			} else if (flags & MSG_PEEK) { -				skb = NULL; -			} else { -				skb_unlink(skb, &sk->sk_receive_queue); -				WRITE_ONCE(u->oob_skb, NULL); -				if (!WARN_ON_ONCE(skb_unref(skb))) -					kfree_skb(skb); -				skb = skb_peek(&sk->sk_receive_queue); +			} else if (!sock_flag(sk, SOCK_URGINLINE)) { +				skb = skb_peek_next(skb, &sk->sk_receive_queue);  			}  		} + +		spin_unlock(&sk->sk_receive_queue.lock); + +		if (unlinked_skb) { +			WARN_ON_ONCE(skb_unref(unlinked_skb)); +			kfree_skb(unlinked_skb); +		}  	}  	return skb;  } @@ -2680,10 +2721,49 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,  static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)  { -	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) +	struct unix_sock *u = unix_sk(sk); +	struct sk_buff *skb; +	int err; + +	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))  		return -ENOTCONN; -	return unix_read_skb(sk, recv_actor); +	mutex_lock(&u->iolock); +	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); +	mutex_unlock(&u->iolock); +	if (!skb) +		return err; + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) +	if (unlikely(skb == READ_ONCE(u->oob_skb))) { +		bool drop = false; + +		unix_state_lock(sk); + +		if (sock_flag(sk, SOCK_DEAD)) { +			unix_state_unlock(sk); +			kfree_skb(skb); +			return -ECONNRESET; +		} + +		spin_lock(&sk->sk_receive_queue.lock); +		if (likely(skb == u->oob_skb)) { +			WRITE_ONCE(u->oob_skb, NULL); +			drop = true; +		} +		spin_unlock(&sk->sk_receive_queue.lock); + +		unix_state_unlock(sk); + +		if (drop) { +			WARN_ON_ONCE(skb_unref(skb)); +			kfree_skb(skb); +			return -EAGAIN; +		} +	} +#endif + +	return recv_actor(sk, skb);  }  static int unix_stream_read_generic(struct unix_stream_read_state *state, @@ -2704,7 +2784,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,  	size_t size = state->size;  	unsigned int last_len; -	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { +	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {  		err = -EINVAL;  		goto out;  	} @@ -2730,9 +2810,8 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,  	skip = max(sk_peek_offset(sk, flags), 0);  	do { -		int chunk; -		bool drop_skb;  		struct sk_buff *skb, *last; +		int chunk;  redo:  		unix_state_lock(sk); @@ -2828,11 +2907,7 @@ unlock:  		}  		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); -		skb_get(skb);  		chunk = state->recv_actor(skb, skip, chunk, state); -		drop_skb = !unix_skb_len(skb); -		/* skb is only safe to use if !drop_skb */ -		consume_skb(skb);  		if (chunk < 0) {  			if (copied == 0)  				copied = -EFAULT; @@ -2841,18 +2916,6 @@ unlock:  		copied += chunk;  		size -= chunk; -		if (drop_skb) { -			/* the skb was touched by a concurrent reader; -			 * we should not expect anything from this skb -			 * anymore and assume it invalid - we can be -			 * sure it was dropped from the socket queue -			 * -			 * let's report a short read -			 */ -			err = 0; -			break; -		} -  		/* Mark read part of skb as used */  		if (!(flags & MSG_PEEK)) {  			UNIXCB(skb).consumed += chunk; @@ -3035,7 +3098,7 @@ long unix_inq_len(struct sock *sk)  	struct sk_buff *skb;  	long amount = 0; -	if (sk->sk_state == TCP_LISTEN) +	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)  		return -EINVAL;  	spin_lock(&sk->sk_receive_queue.lock); @@ -3120,12 +3183,23 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)  	case SIOCATMARK:  		{ +			struct unix_sock *u = unix_sk(sk);  			struct sk_buff *skb;  			int answ = 0; +			mutex_lock(&u->iolock); +  			skb = skb_peek(&sk->sk_receive_queue); -			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb)) -				answ = 1; +			if (skb) { +				struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); + +				if (skb == oob_skb || +				    (!oob_skb && !unix_skb_len(skb))) +					answ = 1; +			} + +			mutex_unlock(&u->iolock); +  			err = put_user(answ, (int __user *)arg);  		}  		break; @@ -3147,12 +3221,14 @@ static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon  static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)  {  	struct sock *sk = sock->sk; +	unsigned char state;  	__poll_t mask;  	u8 shutdown;  	sock_poll_wait(file, sock, wait);  	mask = 0;  	shutdown = READ_ONCE(sk->sk_shutdown); +	state = READ_ONCE(sk->sk_state);  	/* exceptional events? */  	if (READ_ONCE(sk->sk_err)) @@ -3174,14 +3250,14 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa  	/* Connection-based need to check for termination and startup */  	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && -	    sk->sk_state == TCP_CLOSE) +	    state == TCP_CLOSE)  		mask |= EPOLLHUP;  	/*  	 * we set writable also when the other side has shut down the  	 * connection. This prevents stuck sockets.  	 */ -	if (unix_writable(sk)) +	if (unix_writable(sk, state))  		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;  	return mask; @@ -3192,12 +3268,14 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,  {  	struct sock *sk = sock->sk, *other;  	unsigned int writable; +	unsigned char state;  	__poll_t mask;  	u8 shutdown;  	sock_poll_wait(file, sock, wait);  	mask = 0;  	shutdown = READ_ONCE(sk->sk_shutdown); +	state = READ_ONCE(sk->sk_state);  	/* exceptional events? */  	if (READ_ONCE(sk->sk_err) || @@ -3217,19 +3295,14 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,  		mask |= EPOLLIN | EPOLLRDNORM;  	/* Connection-based need to check for termination and startup */ -	if (sk->sk_type == SOCK_SEQPACKET) { -		if (sk->sk_state == TCP_CLOSE) -			mask |= EPOLLHUP; -		/* connection hasn't started yet? */ -		if (sk->sk_state == TCP_SYN_SENT) -			return mask; -	} +	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE) +		mask |= EPOLLHUP;  	/* No write status requested, avoid expensive OUT tests. */  	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))  		return mask; -	writable = unix_writable(sk); +	writable = unix_writable(sk, state);  	if (writable) {  		unix_state_lock(sk); @@ -3623,6 +3696,7 @@ static int __net_init unix_net_init(struct net *net)  	for (i = 0; i < UNIX_HASH_SIZE; i++) {  		spin_lock_init(&net->unx.table.locks[i]); +		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);  		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);  	} | 
