diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-06-04 13:57:43 -0700 | 
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-06-04 13:57:43 -0700 | 
| commit | 408afb8d7847faea115508ba154346e33edfc7d5 (patch) | |
| tree | 9b558f8477d7400ad5e8849c2624471915654ade | |
| parent | b058efc1acfd99027b4c70458e72c3d20a1a5bbc (diff) | |
| parent | 1da92779e2e8f309d5aecbbed346e7f812b174e8 (diff) | |
Merge branch 'work.aio-1' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull aio updates from Al Viro:
 "Majority of AIO stuff this cycle. aio-fsync and aio-poll, mostly.
  The only thing I'm holding back for a day or so is Adam's aio ioprio -
  his last-minute fixup is trivial (missing stub in !CONFIG_BLOCK case),
  but let it sit in -next for decency sake..."
* 'work.aio-1' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (46 commits)
  aio: sanitize the limit checking in io_submit(2)
  aio: fold do_io_submit() into callers
  aio: shift copyin of iocb into io_submit_one()
  aio_read_events_ring(): make a bit more readable
  aio: all callers of aio_{read,write,fsync,poll} treat 0 and -EIOCBQUEUED the same way
  aio: take list removal to (some) callers of aio_complete()
  aio: add missing break for the IOCB_CMD_FDSYNC case
  random: convert to ->poll_mask
  timerfd: convert to ->poll_mask
  eventfd: switch to ->poll_mask
  pipe: convert to ->poll_mask
  crypto: af_alg: convert to ->poll_mask
  net/rxrpc: convert to ->poll_mask
  net/iucv: convert to ->poll_mask
  net/phonet: convert to ->poll_mask
  net/nfc: convert to ->poll_mask
  net/caif: convert to ->poll_mask
  net/bluetooth: convert to ->poll_mask
  net/sctp: convert to ->poll_mask
  net/tipc: convert to ->poll_mask
  ...
99 files changed, 848 insertions, 594 deletions
| diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 15853d522941..2c391338c675 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -440,7 +440,9 @@ prototypes:  	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);  	int (*iterate) (struct file *, struct dir_context *);  	int (*iterate_shared) (struct file *, struct dir_context *); -	unsigned int (*poll) (struct file *, struct poll_table_struct *); +	__poll_t (*poll) (struct file *, struct poll_table_struct *); +	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); +	__poll_t (*poll_mask) (struct file *, __poll_t);  	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);  	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);  	int (*mmap) (struct file *, struct vm_area_struct *); @@ -471,7 +473,7 @@ prototypes:  };  locking rules: -	All may block. +	All except for ->poll_mask may block.  ->llseek() locking has moved from llseek to the individual llseek  implementations.  If your fs is not using generic_file_llseek, you @@ -503,6 +505,9 @@ in sys_read() and friends.  the lease within the individual filesystem to record the result of the  operation +->poll_mask can be called with or without the waitqueue lock for the waitqueue +returned from ->get_poll_head. 
+  --------------------------- dquot_operations -------------------------------  prototypes:  	int (*write_dquot) (struct dquot *); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 5fd325df59e2..829a7b7857a4 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -856,7 +856,9 @@ struct file_operations {  	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);  	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);  	int (*iterate) (struct file *, struct dir_context *); -	unsigned int (*poll) (struct file *, struct poll_table_struct *); +	__poll_t (*poll) (struct file *, struct poll_table_struct *); +	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); +	__poll_t (*poll_mask) (struct file *, __poll_t);  	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);  	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);  	int (*mmap) (struct file *, struct vm_area_struct *); @@ -901,6 +903,17 @@ otherwise noted.  	activity on this file and (optionally) go to sleep until there  	is activity. Called by the select(2) and poll(2) system calls +  get_poll_head: Returns the struct wait_queue_head that callers can +  wait on.  Callers need to check the returned events using ->poll_mask +  once woken.  Can return NULL to indicate polling is not supported, +  or any error code using the ERR_PTR convention to indicate that a +  grave error occurred and ->poll_mask shall not be called. + +  poll_mask: return the mask of EPOLL* values describing the file descriptor +  state.  Called either before going to sleep on the waitqueue returned by +  get_poll_head, or after it has been woken.  If ->get_poll_head and +  ->poll_mask are implemented ->poll does not need to be implemented. +    unlocked_ioctl: called by the ioctl(2) system call.    
compat_ioctl: called by the ioctl(2) system call when 32 bit system calls diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index d6b27dab1b30..14a2f996e543 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -396,3 +396,4 @@  382	i386	pkey_free		sys_pkey_free			__ia32_sys_pkey_free  383	i386	statx			sys_statx			__ia32_sys_statx  384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl +385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 4dfe42666d0c..cd36232ab62f 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -341,6 +341,7 @@  330	common	pkey_alloc		__x64_sys_pkey_alloc  331	common	pkey_free		__x64_sys_pkey_free  332	common	statx			__x64_sys_statx +333	common	io_pgetevents		__x64_sys_io_pgetevents  #  # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 7846c0c20cfe..89ed613c017e 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -347,7 +347,6 @@ static const struct proto_ops alg_proto_ops = {  	.sendpage	=	sock_no_sendpage,  	.sendmsg	=	sock_no_sendmsg,  	.recvmsg	=	sock_no_recvmsg, -	.poll		=	sock_no_poll,  	.bind		=	alg_bind,  	.release	=	af_alg_release, @@ -1061,19 +1060,12 @@ void af_alg_async_cb(struct crypto_async_request *_req, int err)  }  EXPORT_SYMBOL_GPL(af_alg_async_cb); -/** - * af_alg_poll - poll system call handler - */ -__poll_t af_alg_poll(struct file *file, struct socket *sock, -			 poll_table *wait) +__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events)  {  	struct sock *sk = sock->sk;  	struct alg_sock *ask = alg_sk(sk);  	struct af_alg_ctx *ctx = ask->private; -	__poll_t mask; - -	sock_poll_wait(file, sk_sleep(sk), wait); -	mask = 0; +	__poll_t mask = 0;  	if (!ctx->more || ctx->used)  		
mask |= EPOLLIN | EPOLLRDNORM; @@ -1083,7 +1075,7 @@ __poll_t af_alg_poll(struct file *file, struct socket *sock,  	return mask;  } -EXPORT_SYMBOL_GPL(af_alg_poll); +EXPORT_SYMBOL_GPL(af_alg_poll_mask);  /**   * af_alg_alloc_areq - allocate struct af_alg_async_req diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c index 4b07edd5a9ff..330cf9f2b767 100644 --- a/crypto/algif_aead.c +++ b/crypto/algif_aead.c @@ -375,7 +375,7 @@ static struct proto_ops algif_aead_ops = {  	.sendmsg	=	aead_sendmsg,  	.sendpage	=	af_alg_sendpage,  	.recvmsg	=	aead_recvmsg, -	.poll		=	af_alg_poll, +	.poll_mask	=	af_alg_poll_mask,  };  static int aead_check_key(struct socket *sock) @@ -471,7 +471,7 @@ static struct proto_ops algif_aead_ops_nokey = {  	.sendmsg	=	aead_sendmsg_nokey,  	.sendpage	=	aead_sendpage_nokey,  	.recvmsg	=	aead_recvmsg_nokey, -	.poll		=	af_alg_poll, +	.poll_mask	=	af_alg_poll_mask,  };  static void *aead_bind(const char *name, u32 type, u32 mask) diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index 6c9b1927a520..bfcf595fd8f9 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -288,7 +288,6 @@ static struct proto_ops algif_hash_ops = {  	.mmap		=	sock_no_mmap,  	.bind		=	sock_no_bind,  	.setsockopt	=	sock_no_setsockopt, -	.poll		=	sock_no_poll,  	.release	=	af_alg_release,  	.sendmsg	=	hash_sendmsg, @@ -396,7 +395,6 @@ static struct proto_ops algif_hash_ops_nokey = {  	.mmap		=	sock_no_mmap,  	.bind		=	sock_no_bind,  	.setsockopt	=	sock_no_setsockopt, -	.poll		=	sock_no_poll,  	.release	=	af_alg_release,  	.sendmsg	=	hash_sendmsg_nokey, diff --git a/crypto/algif_rng.c b/crypto/algif_rng.c index 150c2b6480ed..22df3799a17b 100644 --- a/crypto/algif_rng.c +++ b/crypto/algif_rng.c @@ -106,7 +106,6 @@ static struct proto_ops algif_rng_ops = {  	.bind		=	sock_no_bind,  	.accept		=	sock_no_accept,  	.setsockopt	=	sock_no_setsockopt, -	.poll		=	sock_no_poll,  	.sendmsg	=	sock_no_sendmsg,  	.sendpage	=	sock_no_sendpage, diff --git a/crypto/algif_skcipher.c 
b/crypto/algif_skcipher.c index c4e885df4564..15cf3c5222e0 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -205,7 +205,7 @@ static struct proto_ops algif_skcipher_ops = {  	.sendmsg	=	skcipher_sendmsg,  	.sendpage	=	af_alg_sendpage,  	.recvmsg	=	skcipher_recvmsg, -	.poll		=	af_alg_poll, +	.poll_mask	=	af_alg_poll_mask,  };  static int skcipher_check_key(struct socket *sock) @@ -301,7 +301,7 @@ static struct proto_ops algif_skcipher_ops_nokey = {  	.sendmsg	=	skcipher_sendmsg_nokey,  	.sendpage	=	skcipher_sendpage_nokey,  	.recvmsg	=	skcipher_recvmsg_nokey, -	.poll		=	af_alg_poll, +	.poll_mask	=	af_alg_poll_mask,  };  static void *skcipher_bind(const char *name, u32 type, u32 mask) diff --git a/drivers/char/random.c b/drivers/char/random.c index cd888d4ee605..a8fb0020ba5c 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -402,8 +402,7 @@ static struct poolinfo {  /*   * Static global variables   */ -static DECLARE_WAIT_QUEUE_HEAD(random_read_wait); -static DECLARE_WAIT_QUEUE_HEAD(random_write_wait); +static DECLARE_WAIT_QUEUE_HEAD(random_wait);  static struct fasync_struct *fasync;  static DEFINE_SPINLOCK(random_ready_list_lock); @@ -722,8 +721,8 @@ retry:  		/* should we wake readers? 
*/  		if (entropy_bits >= random_read_wakeup_bits && -		    wq_has_sleeper(&random_read_wait)) { -			wake_up_interruptible(&random_read_wait); +		    wq_has_sleeper(&random_wait)) { +			wake_up_interruptible_poll(&random_wait, POLLIN);  			kill_fasync(&fasync, SIGIO, POLL_IN);  		}  		/* If the input pool is getting full, send some @@ -1397,7 +1396,7 @@ retry:  	trace_debit_entropy(r->name, 8 * ibytes);  	if (ibytes &&  	    (r->entropy_count >> ENTROPY_SHIFT) < random_write_wakeup_bits) { -		wake_up_interruptible(&random_write_wait); +		wake_up_interruptible_poll(&random_wait, POLLOUT);  		kill_fasync(&fasync, SIGIO, POLL_OUT);  	} @@ -1839,7 +1838,7 @@ _random_read(int nonblock, char __user *buf, size_t nbytes)  		if (nonblock)  			return -EAGAIN; -		wait_event_interruptible(random_read_wait, +		wait_event_interruptible(random_wait,  			ENTROPY_BITS(&input_pool) >=  			random_read_wakeup_bits);  		if (signal_pending(current)) @@ -1876,14 +1875,17 @@ urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)  	return ret;  } +static struct wait_queue_head * +random_get_poll_head(struct file *file, __poll_t events) +{ +	return &random_wait; +} +  static __poll_t -random_poll(struct file *file, poll_table * wait) +random_poll_mask(struct file *file, __poll_t events)  { -	__poll_t mask; +	__poll_t mask = 0; -	poll_wait(file, &random_read_wait, wait); -	poll_wait(file, &random_write_wait, wait); -	mask = 0;  	if (ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits)  		mask |= EPOLLIN | EPOLLRDNORM;  	if (ENTROPY_BITS(&input_pool) < random_write_wakeup_bits) @@ -1990,7 +1992,8 @@ static int random_fasync(int fd, struct file *filp, int on)  const struct file_operations random_fops = {  	.read  = random_read,  	.write = random_write, -	.poll  = random_poll, +	.get_poll_head  = random_get_poll_head, +	.poll_mask  = random_poll_mask,  	.unlocked_ioctl = random_ioctl,  	.fasync = random_fasync,  	.llseek = noop_llseek, @@ -2323,7 +2326,7 @@ void 
add_hwgenerator_randomness(const char *buffer, size_t count,  	 * We'll be woken up again once below random_write_wakeup_thresh,  	 * or when the calling thread is about to terminate.  	 */ -	wait_event_interruptible(random_write_wait, kthread_should_stop() || +	wait_event_interruptible(random_wait, kthread_should_stop() ||  			ENTROPY_BITS(&input_pool) <= random_write_wakeup_bits);  	mix_pool_bytes(poolp, buffer, count);  	credit_entropy_bits(poolp, entropy); diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 1f8f489b4167..98f90aadd141 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -588,7 +588,7 @@ static const struct proto_ops data_sock_ops = {  	.getname	= data_sock_getname,  	.sendmsg	= mISDN_sock_sendmsg,  	.recvmsg	= mISDN_sock_recvmsg, -	.poll		= datagram_poll, +	.poll_mask	= datagram_poll_mask,  	.listen		= sock_no_listen,  	.shutdown	= sock_no_shutdown,  	.setsockopt	= data_sock_setsockopt, @@ -745,7 +745,6 @@ static const struct proto_ops base_sock_ops = {  	.getname	= sock_no_getname,  	.sendmsg	= sock_no_sendmsg,  	.recvmsg	= sock_no_recvmsg, -	.poll		= sock_no_poll,  	.listen		= sock_no_listen,  	.shutdown	= sock_no_shutdown,  	.setsockopt	= sock_no_setsockopt, diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index ce61231e96ea..de51e8f70f44 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -1107,7 +1107,7 @@ static const struct proto_ops pppoe_ops = {  	.socketpair	= sock_no_socketpair,  	.accept		= sock_no_accept,  	.getname	= pppoe_getname, -	.poll		= datagram_poll, +	.poll_mask	= datagram_poll_mask,  	.listen		= sock_no_listen,  	.shutdown	= sock_no_shutdown,  	.setsockopt	= sock_no_setsockopt, diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index c4267ecefd85..157b67c1bf8e 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -624,7 +624,6 @@ static const struct proto_ops pptp_ops = {  	.socketpair = sock_no_socketpair,  	.accept   
  = sock_no_accept,  	.getname    = pptp_getname, -	.poll       = sock_no_poll,  	.listen     = sock_no_listen,  	.shutdown   = sock_no_shutdown,  	.setsockopt = sock_no_setsockopt, diff --git a/drivers/staging/comedi/drivers/serial2002.c b/drivers/staging/comedi/drivers/serial2002.c index b3f3b4a201af..5471b2212a62 100644 --- a/drivers/staging/comedi/drivers/serial2002.c +++ b/drivers/staging/comedi/drivers/serial2002.c @@ -113,7 +113,7 @@ static void serial2002_tty_read_poll_wait(struct file *f, int timeout)  		long elapsed;  		__poll_t mask; -		mask = f->f_op->poll(f, &table.pt); +		mask = vfs_poll(f, &table.pt);  		if (mask & (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN |  			    EPOLLHUP | EPOLLERR)) {  			break; @@ -136,7 +136,7 @@ static int serial2002_tty_read(struct file *f, int timeout)  	result = -1;  	if (!IS_ERR(f)) { -		if (f->f_op->poll) { +		if (file_can_poll(f)) {  			serial2002_tty_read_poll_wait(f, timeout);  			if (kernel_read(f, &ch, 1, &pos) == 1) diff --git a/drivers/staging/ipx/af_ipx.c b/drivers/staging/ipx/af_ipx.c index 5703dd176787..208b5c161631 100644 --- a/drivers/staging/ipx/af_ipx.c +++ b/drivers/staging/ipx/af_ipx.c @@ -1965,7 +1965,7 @@ static const struct proto_ops ipx_dgram_ops = {  	.socketpair	= sock_no_socketpair,  	.accept		= sock_no_accept,  	.getname	= ipx_getname, -	.poll		= datagram_poll, +	.poll_mask	= datagram_poll_mask,  	.ioctl		= ipx_ioctl,  #ifdef CONFIG_COMPAT  	.compat_ioctl	= ipx_compat_ioctl, diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c index 085700f1be10..2a1be859ee71 100644 --- a/drivers/vfio/virqfd.c +++ b/drivers/vfio/virqfd.c @@ -166,7 +166,7 @@ int vfio_virqfd_enable(void *opaque,  	init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);  	init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc); -	events = irqfd.file->f_op->poll(irqfd.file, &virqfd->pt); +	events = vfs_poll(irqfd.file, &virqfd->pt);  	/*  	 * Check if there was an event already pending on the eventfd diff --git 
a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index f0be5f35ab28..895eaa25807c 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -208,7 +208,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file)  	if (poll->wqh)  		return 0; -	mask = file->f_op->poll(file, &poll->table); +	mask = vfs_poll(file, &poll->table);  	if (mask)  		vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));  	if (mask & EPOLLERR) { @@ -5,6 +5,7 @@   *	Implements an efficient asynchronous io interface.   *   *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved. + *	Copyright 2018 Christoph Hellwig.   *   *	See ../COPYING for licensing terms.   */ @@ -46,6 +47,8 @@  #include "internal.h" +#define KIOCB_KEY		0 +  #define AIO_RING_MAGIC			0xa10a10a1  #define AIO_RING_COMPAT_FEATURES	1  #define AIO_RING_INCOMPAT_FEATURES	0 @@ -156,21 +159,29 @@ struct kioctx {  	unsigned		id;  }; -/* - * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either - * cancelled or completed (this makes a certain amount of sense because - * successful cancellation - io_cancel() - does deliver the completion to - * userspace). - * - * And since most things don't implement kiocb cancellation and we'd really like - * kiocb completion to be lockless when possible, we use ki_cancel to - * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED - * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel(). 
- */ -#define KIOCB_CANCELLED		((void *) (~0ULL)) +struct fsync_iocb { +	struct work_struct	work; +	struct file		*file; +	bool			datasync; +}; + +struct poll_iocb { +	struct file		*file; +	__poll_t		events; +	struct wait_queue_head	*head; + +	union { +		struct wait_queue_entry	wait; +		struct work_struct	work; +	}; +};  struct aio_kiocb { -	struct kiocb		common; +	union { +		struct kiocb		rw; +		struct fsync_iocb	fsync; +		struct poll_iocb	poll; +	};  	struct kioctx		*ki_ctx;  	kiocb_cancel_fn		*ki_cancel; @@ -264,9 +275,6 @@ static int __init aio_setup(void)  	kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);  	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); - -	pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); -  	return 0;  }  __initcall(aio_setup); @@ -552,42 +560,20 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)  void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)  { -	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common); +	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);  	struct kioctx *ctx = req->ki_ctx;  	unsigned long flags; -	spin_lock_irqsave(&ctx->ctx_lock, flags); - -	if (!req->ki_list.next) -		list_add(&req->ki_list, &ctx->active_reqs); +	if (WARN_ON_ONCE(!list_empty(&req->ki_list))) +		return; +	spin_lock_irqsave(&ctx->ctx_lock, flags); +	list_add_tail(&req->ki_list, &ctx->active_reqs);  	req->ki_cancel = cancel; -  	spin_unlock_irqrestore(&ctx->ctx_lock, flags);  }  EXPORT_SYMBOL(kiocb_set_cancel_fn); -static int kiocb_cancel(struct aio_kiocb *kiocb) -{ -	kiocb_cancel_fn *old, *cancel; - -	/* -	 * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it -	 * actually has a cancel function, hence the cmpxchg() -	 */ - -	cancel = READ_ONCE(kiocb->ki_cancel); -	do { -		if (!cancel || cancel == KIOCB_CANCELLED) -			return -EINVAL; - -		old = cancel; -		cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); -	} while 
(cancel != old); - -	return cancel(&kiocb->common); -} -  /*   * free_ioctx() should be RCU delayed to synchronize against the RCU   * protected lookup_ioctx() and also needs process context to call @@ -634,7 +620,7 @@ static void free_ioctx_users(struct percpu_ref *ref)  	while (!list_empty(&ctx->active_reqs)) {  		req = list_first_entry(&ctx->active_reqs,  				       struct aio_kiocb, ki_list); -		kiocb_cancel(req); +		req->ki_cancel(&req->rw);  		list_del_init(&req->ki_list);  	} @@ -1041,7 +1027,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)  		goto out_put;  	percpu_ref_get(&ctx->reqs); - +	INIT_LIST_HEAD(&req->ki_list);  	req->ki_ctx = ctx;  	return req;  out_put: @@ -1049,15 +1035,6 @@ out_put:  	return NULL;  } -static void kiocb_free(struct aio_kiocb *req) -{ -	if (req->common.ki_filp) -		fput(req->common.ki_filp); -	if (req->ki_eventfd != NULL) -		eventfd_ctx_put(req->ki_eventfd); -	kmem_cache_free(kiocb_cachep, req); -} -  static struct kioctx *lookup_ioctx(unsigned long ctx_id)  {  	struct aio_ring __user *ring  = (void __user *)ctx_id; @@ -1088,44 +1065,14 @@ out:  /* aio_complete   *	Called when the io request on the given iocb is complete.   */ -static void aio_complete(struct kiocb *kiocb, long res, long res2) +static void aio_complete(struct aio_kiocb *iocb, long res, long res2)  { -	struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common);  	struct kioctx	*ctx = iocb->ki_ctx;  	struct aio_ring	*ring;  	struct io_event	*ev_page, *event;  	unsigned tail, pos, head;  	unsigned long	flags; -	if (kiocb->ki_flags & IOCB_WRITE) { -		struct file *file = kiocb->ki_filp; - -		/* -		 * Tell lockdep we inherited freeze protection from submission -		 * thread. 
-		 */ -		if (S_ISREG(file_inode(file)->i_mode)) -			__sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE); -		file_end_write(file); -	} - -	/* -	 * Special case handling for sync iocbs: -	 *  - events go directly into the iocb for fast handling -	 *  - the sync task with the iocb in its stack holds the single iocb -	 *    ref, no other paths have a way to get another ref -	 *  - the sync task helpfully left a reference to itself in the iocb -	 */ -	BUG_ON(is_sync_kiocb(kiocb)); - -	if (iocb->ki_list.next) { -		unsigned long flags; - -		spin_lock_irqsave(&ctx->ctx_lock, flags); -		list_del(&iocb->ki_list); -		spin_unlock_irqrestore(&ctx->ctx_lock, flags); -	} -  	/*  	 * Add a completion event to the ring buffer. Must be done holding  	 * ctx->completion_lock to prevent other code from messing with the tail @@ -1179,11 +1126,12 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2)  	 * eventfd. The eventfd_signal() function is safe to be called  	 * from IRQ context.  	 */ -	if (iocb->ki_eventfd != NULL) +	if (iocb->ki_eventfd) {  		eventfd_signal(iocb->ki_eventfd, 1); +		eventfd_ctx_put(iocb->ki_eventfd); +	} -	/* everything turned out well, dispose of the aiocb. 
*/ -	kiocb_free(iocb); +	kmem_cache_free(kiocb_cachep, iocb);  	/*  	 * We have to order our ring_info tail store above and test @@ -1249,14 +1197,13 @@ static long aio_read_events_ring(struct kioctx *ctx,  		if (head == tail)  			break; -		avail = min(avail, nr - ret); -		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - -			    ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE)); -  		pos = head + AIO_EVENTS_OFFSET;  		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];  		pos %= AIO_EVENTS_PER_PAGE; +		avail = min(avail, nr - ret); +		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos); +  		ev = kmap(page);  		copy_ret = copy_to_user(event + ret, ev + pos,  					sizeof(*ev) * avail); @@ -1327,10 +1274,6 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,  		wait_event_interruptible_hrtimeout(ctx->wait,  				aio_read_events(ctx, min_nr, nr, event, &ret),  				until); - -	if (!ret && signal_pending(current)) -		ret = -EINTR; -  	return ret;  } @@ -1446,6 +1389,58 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)  	return -EINVAL;  } +static void aio_remove_iocb(struct aio_kiocb *iocb) +{ +	struct kioctx *ctx = iocb->ki_ctx; +	unsigned long flags; + +	spin_lock_irqsave(&ctx->ctx_lock, flags); +	list_del(&iocb->ki_list); +	spin_unlock_irqrestore(&ctx->ctx_lock, flags); +} + +static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) +{ +	struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw); + +	if (!list_empty_careful(&iocb->ki_list)) +		aio_remove_iocb(iocb); + +	if (kiocb->ki_flags & IOCB_WRITE) { +		struct inode *inode = file_inode(kiocb->ki_filp); + +		/* +		 * Tell lockdep we inherited freeze protection from submission +		 * thread. 
+		 */ +		if (S_ISREG(inode->i_mode)) +			__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); +		file_end_write(kiocb->ki_filp); +	} + +	fput(kiocb->ki_filp); +	aio_complete(iocb, res, res2); +} + +static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) +{ +	int ret; + +	req->ki_filp = fget(iocb->aio_fildes); +	if (unlikely(!req->ki_filp)) +		return -EBADF; +	req->ki_complete = aio_complete_rw; +	req->ki_pos = iocb->aio_offset; +	req->ki_flags = iocb_flags(req->ki_filp); +	if (iocb->aio_flags & IOCB_FLAG_RESFD) +		req->ki_flags |= IOCB_EVENTFD; +	req->ki_hint = file_write_hint(req->ki_filp); +	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); +	if (unlikely(ret)) +		fput(req->ki_filp); +	return ret; +} +  static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,  		bool vectored, bool compat, struct iov_iter *iter)  { @@ -1465,11 +1460,11 @@ static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,  	return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);  } -static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret) +static inline void aio_rw_done(struct kiocb *req, ssize_t ret)  {  	switch (ret) {  	case -EIOCBQUEUED: -		return ret; +		break;  	case -ERESTARTSYS:  	case -ERESTARTNOINTR:  	case -ERESTARTNOHAND: @@ -1481,85 +1476,270 @@ static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret)  		ret = -EINTR;  		/*FALLTHRU*/  	default: -		aio_complete(req, ret, 0); -		return 0; +		aio_complete_rw(req, ret, 0);  	}  }  static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,  		bool compat)  { -	struct file *file = req->ki_filp;  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;  	struct iov_iter iter; +	struct file *file;  	ssize_t ret; +	ret = aio_prep_rw(req, iocb); +	if (ret) +		return ret; +	file = req->ki_filp; + +	ret = -EBADF;  	if (unlikely(!(file->f_mode & FMODE_READ))) -		return -EBADF; +		goto out_fput; +	ret = -EINVAL;  	if (unlikely(!file->f_op->read_iter)) -		return 
-EINVAL; +		goto out_fput;  	ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);  	if (ret) -		return ret; +		goto out_fput;  	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));  	if (!ret) -		ret = aio_ret(req, call_read_iter(file, req, &iter)); +		aio_rw_done(req, call_read_iter(file, req, &iter));  	kfree(iovec); +out_fput: +	if (unlikely(ret)) +		fput(file);  	return ret;  }  static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,  		bool compat)  { -	struct file *file = req->ki_filp;  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;  	struct iov_iter iter; +	struct file *file;  	ssize_t ret; +	ret = aio_prep_rw(req, iocb); +	if (ret) +		return ret; +	file = req->ki_filp; + +	ret = -EBADF;  	if (unlikely(!(file->f_mode & FMODE_WRITE))) -		return -EBADF; +		goto out_fput; +	ret = -EINVAL;  	if (unlikely(!file->f_op->write_iter)) -		return -EINVAL; +		goto out_fput;  	ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);  	if (ret) -		return ret; +		goto out_fput;  	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));  	if (!ret) { -		req->ki_flags |= IOCB_WRITE; -		file_start_write(file); -		ret = aio_ret(req, call_write_iter(file, req, &iter));  		/* -		 * We release freeze protection in aio_complete().  Fool lockdep -		 * by telling it the lock got released so that it doesn't -		 * complain about held lock when we return to userspace. +		 * Open-code file_start_write here to grab freeze protection, +		 * which will be released by another thread in +		 * aio_complete_rw().  Fool lockdep by telling it the lock got +		 * released so that it doesn't complain about the held lock when +		 * we return to userspace.  		 
*/ -		if (S_ISREG(file_inode(file)->i_mode)) +		if (S_ISREG(file_inode(file)->i_mode)) { +			__sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);  			__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); +		} +		req->ki_flags |= IOCB_WRITE; +		aio_rw_done(req, call_write_iter(file, req, &iter));  	}  	kfree(iovec); +out_fput: +	if (unlikely(ret)) +		fput(file);  	return ret;  } +static void aio_fsync_work(struct work_struct *work) +{ +	struct fsync_iocb *req = container_of(work, struct fsync_iocb, work); +	int ret; + +	ret = vfs_fsync(req->file, req->datasync); +	fput(req->file); +	aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); +} + +static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) +{ +	if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || +			iocb->aio_rw_flags)) +		return -EINVAL; +	req->file = fget(iocb->aio_fildes); +	if (unlikely(!req->file)) +		return -EBADF; +	if (unlikely(!req->file->f_op->fsync)) { +		fput(req->file); +		return -EINVAL; +	} + +	req->datasync = datasync; +	INIT_WORK(&req->work, aio_fsync_work); +	schedule_work(&req->work); +	return 0; +} + +/* need to use list_del_init so we can check if item was present */ +static inline bool __aio_poll_remove(struct poll_iocb *req) +{ +	if (list_empty(&req->wait.entry)) +		return false; +	list_del_init(&req->wait.entry); +	return true; +} + +static inline void __aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) +{ +	fput(iocb->poll.file); +	aio_complete(iocb, mangle_poll(mask), 0); +} + +static void aio_poll_work(struct work_struct *work) +{ +	struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, poll.work); + +	if (!list_empty_careful(&iocb->ki_list)) +		aio_remove_iocb(iocb); +	__aio_poll_complete(iocb, iocb->poll.events); +} + +static int aio_poll_cancel(struct kiocb *iocb) +{ +	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); +	struct poll_iocb *req = &aiocb->poll; +	struct 
wait_queue_head *head = req->head; +	bool found = false; + +	spin_lock(&head->lock); +	found = __aio_poll_remove(req); +	spin_unlock(&head->lock); + +	if (found) { +		req->events = 0; +		INIT_WORK(&req->work, aio_poll_work); +		schedule_work(&req->work); +	} +	return 0; +} + +static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, +		void *key) +{ +	struct poll_iocb *req = container_of(wait, struct poll_iocb, wait); +	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); +	struct file *file = req->file; +	__poll_t mask = key_to_poll(key); + +	assert_spin_locked(&req->head->lock); + +	/* for instances that support it check for an event match first: */ +	if (mask && !(mask & req->events)) +		return 0; + +	mask = file->f_op->poll_mask(file, req->events); +	if (!mask) +		return 0; + +	__aio_poll_remove(req); + +	/* +	 * Try completing without a context switch if we can acquire ctx_lock +	 * without spinning.  Otherwise we need to defer to a workqueue to +	 * avoid a deadlock due to the lock order. +	 */ +	if (spin_trylock(&iocb->ki_ctx->ctx_lock)) { +		list_del_init(&iocb->ki_list); +		spin_unlock(&iocb->ki_ctx->ctx_lock); + +		__aio_poll_complete(iocb, mask); +	} else { +		req->events = mask; +		INIT_WORK(&req->work, aio_poll_work); +		schedule_work(&req->work); +	} + +	return 1; +} + +static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) +{ +	struct kioctx *ctx = aiocb->ki_ctx; +	struct poll_iocb *req = &aiocb->poll; +	__poll_t mask; + +	/* reject any unknown events outside the normal event mask. 
*/ +	if ((u16)iocb->aio_buf != iocb->aio_buf) +		return -EINVAL; +	/* reject fields that are not defined for poll */ +	if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags) +		return -EINVAL; + +	req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; +	req->file = fget(iocb->aio_fildes); +	if (unlikely(!req->file)) +		return -EBADF; +	if (!file_has_poll_mask(req->file)) +		goto out_fail; + +	req->head = req->file->f_op->get_poll_head(req->file, req->events); +	if (!req->head) +		goto out_fail; +	if (IS_ERR(req->head)) { +		mask = EPOLLERR; +		goto done; +	} + +	init_waitqueue_func_entry(&req->wait, aio_poll_wake); +	aiocb->ki_cancel = aio_poll_cancel; + +	spin_lock_irq(&ctx->ctx_lock); +	spin_lock(&req->head->lock); +	mask = req->file->f_op->poll_mask(req->file, req->events); +	if (!mask) { +		__add_wait_queue(req->head, &req->wait); +		list_add_tail(&aiocb->ki_list, &ctx->active_reqs); +	} +	spin_unlock(&req->head->lock); +	spin_unlock_irq(&ctx->ctx_lock); +done: +	if (mask) +		__aio_poll_complete(aiocb, mask); +	return 0; +out_fail: +	fput(req->file); +	return -EINVAL; /* same as no support for IOCB_CMD_POLL */ +} +  static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, -			 struct iocb *iocb, bool compat) +			 bool compat)  {  	struct aio_kiocb *req; -	struct file *file; +	struct iocb iocb;  	ssize_t ret; +	if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) +		return -EFAULT; +  	/* enforce forwards compatibility on users */ -	if (unlikely(iocb->aio_reserved2)) { +	if (unlikely(iocb.aio_reserved2)) {  		pr_debug("EINVAL: reserve field set\n");  		return -EINVAL;  	}  	/* prevent overflows */  	if (unlikely( -	    (iocb->aio_buf != (unsigned long)iocb->aio_buf) || -	    (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || -	    ((ssize_t)iocb->aio_nbytes < 0) +	    (iocb.aio_buf != (unsigned long)iocb.aio_buf) || +	    (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) || +	    ((ssize_t)iocb.aio_nbytes < 0)  	   )) {  
		pr_debug("EINVAL: overflow check\n");  		return -EINVAL; @@ -1569,37 +1749,19 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,  	if (unlikely(!req))  		return -EAGAIN; -	req->common.ki_filp = file = fget(iocb->aio_fildes); -	if (unlikely(!req->common.ki_filp)) { -		ret = -EBADF; -		goto out_put_req; -	} -	req->common.ki_pos = iocb->aio_offset; -	req->common.ki_complete = aio_complete; -	req->common.ki_flags = iocb_flags(req->common.ki_filp); -	req->common.ki_hint = file_write_hint(file); - -	if (iocb->aio_flags & IOCB_FLAG_RESFD) { +	if (iocb.aio_flags & IOCB_FLAG_RESFD) {  		/*  		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an  		 * instance of the file* now. The file descriptor must be  		 * an eventfd() fd, and will be signaled for each completed  		 * event using the eventfd_signal() function.  		 */ -		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd); +		req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd);  		if (IS_ERR(req->ki_eventfd)) {  			ret = PTR_ERR(req->ki_eventfd);  			req->ki_eventfd = NULL;  			goto out_put_req;  		} - -		req->common.ki_flags |= IOCB_EVENTFD; -	} - -	ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags); -	if (unlikely(ret)) { -		pr_debug("EINVAL: aio_rw_flags\n"); -		goto out_put_req;  	}  	ret = put_user(KIOCB_KEY, &user_iocb->aio_key); @@ -1609,41 +1771,67 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,  	}  	req->ki_user_iocb = user_iocb; -	req->ki_user_data = iocb->aio_data; +	req->ki_user_data = iocb.aio_data; -	get_file(file); -	switch (iocb->aio_lio_opcode) { +	switch (iocb.aio_lio_opcode) {  	case IOCB_CMD_PREAD: -		ret = aio_read(&req->common, iocb, false, compat); +		ret = aio_read(&req->rw, &iocb, false, compat);  		break;  	case IOCB_CMD_PWRITE: -		ret = aio_write(&req->common, iocb, false, compat); +		ret = aio_write(&req->rw, &iocb, false, compat);  		break;  	case IOCB_CMD_PREADV: -		ret = aio_read(&req->common, iocb, 
true, compat); +		ret = aio_read(&req->rw, &iocb, true, compat);  		break;  	case IOCB_CMD_PWRITEV: -		ret = aio_write(&req->common, iocb, true, compat); +		ret = aio_write(&req->rw, &iocb, true, compat); +		break; +	case IOCB_CMD_FSYNC: +		ret = aio_fsync(&req->fsync, &iocb, false); +		break; +	case IOCB_CMD_FDSYNC: +		ret = aio_fsync(&req->fsync, &iocb, true); +		break; +	case IOCB_CMD_POLL: +		ret = aio_poll(req, &iocb);  		break;  	default: -		pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); +		pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);  		ret = -EINVAL;  		break;  	} -	fput(file); -	if (ret && ret != -EIOCBQUEUED) +	/* +	 * If ret is 0, we'd either done aio_complete() ourselves or have +	 * arranged for that to be done asynchronously.  Anything non-zero +	 * means that we need to destroy req ourselves. +	 */ +	if (ret)  		goto out_put_req;  	return 0;  out_put_req:  	put_reqs_available(ctx, 1);  	percpu_ref_put(&ctx->reqs); -	kiocb_free(req); +	if (req->ki_eventfd) +		eventfd_ctx_put(req->ki_eventfd); +	kmem_cache_free(kiocb_cachep, req);  	return ret;  } -static long do_io_submit(aio_context_t ctx_id, long nr, -			  struct iocb __user *__user *iocbpp, bool compat) +/* sys_io_submit: + *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns + *	the number of iocbs queued.  May return -EINVAL if the aio_context + *	specified by ctx_id is invalid, if nr is < 0, if the iocb at + *	*iocbpp[0] is not properly initialized, if the operation specified + *	is invalid for the file descriptor in the iocb.  May fail with + *	-EFAULT if any of the data structures point to invalid data.  May + *	fail with -EBADF if the file descriptor specified in the first + *	iocb is invalid.  May fail with -EAGAIN if insufficient resources + *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will + *	fail with -ENOSYS if not implemented. 
+ */ +SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, +		struct iocb __user * __user *, iocbpp)  {  	struct kioctx *ctx;  	long ret = 0; @@ -1653,39 +1841,25 @@ static long do_io_submit(aio_context_t ctx_id, long nr,  	if (unlikely(nr < 0))  		return -EINVAL; -	if (unlikely(nr > LONG_MAX/sizeof(*iocbpp))) -		nr = LONG_MAX/sizeof(*iocbpp); - -	if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) -		return -EFAULT; -  	ctx = lookup_ioctx(ctx_id);  	if (unlikely(!ctx)) {  		pr_debug("EINVAL: invalid context id\n");  		return -EINVAL;  	} -	blk_start_plug(&plug); +	if (nr > ctx->nr_events) +		nr = ctx->nr_events; -	/* -	 * AKPM: should this return a partial result if some of the IOs were -	 * successfully submitted? -	 */ -	for (i=0; i<nr; i++) { +	blk_start_plug(&plug); +	for (i = 0; i < nr; i++) {  		struct iocb __user *user_iocb; -		struct iocb tmp; - -		if (unlikely(__get_user(user_iocb, iocbpp + i))) { -			ret = -EFAULT; -			break; -		} -		if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) { +		if (unlikely(get_user(user_iocb, iocbpp + i))) {  			ret = -EFAULT;  			break;  		} -		ret = io_submit_one(ctx, user_iocb, &tmp, compat); +		ret = io_submit_one(ctx, user_iocb, false);  		if (ret)  			break;  	} @@ -1695,59 +1869,44 @@ static long do_io_submit(aio_context_t ctx_id, long nr,  	return i ? i : ret;  } -/* sys_io_submit: - *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns - *	the number of iocbs queued.  May return -EINVAL if the aio_context - *	specified by ctx_id is invalid, if nr is < 0, if the iocb at - *	*iocbpp[0] is not properly initialized, if the operation specified - *	is invalid for the file descriptor in the iocb.  May fail with - *	-EFAULT if any of the data structures point to invalid data.  May - *	fail with -EBADF if the file descriptor specified in the first - *	iocb is invalid.  May fail with -EAGAIN if insufficient resources - *	are available to queue any iocbs.  Will return 0 if nr is 0.  
Will - *	fail with -ENOSYS if not implemented. - */ -SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, -		struct iocb __user * __user *, iocbpp) -{ -	return do_io_submit(ctx_id, nr, iocbpp, 0); -} -  #ifdef CONFIG_COMPAT -static inline long -copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) +COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, +		       int, nr, compat_uptr_t __user *, iocbpp)  { -	compat_uptr_t uptr; -	int i; +	struct kioctx *ctx; +	long ret = 0; +	int i = 0; +	struct blk_plug plug; -	for (i = 0; i < nr; ++i) { -		if (get_user(uptr, ptr32 + i)) -			return -EFAULT; -		if (put_user(compat_ptr(uptr), ptr64 + i)) -			return -EFAULT; +	if (unlikely(nr < 0)) +		return -EINVAL; + +	ctx = lookup_ioctx(ctx_id); +	if (unlikely(!ctx)) { +		pr_debug("EINVAL: invalid context id\n"); +		return -EINVAL;  	} -	return 0; -} -#define MAX_AIO_SUBMITS 	(PAGE_SIZE/sizeof(struct iocb *)) +	if (nr > ctx->nr_events) +		nr = ctx->nr_events; -COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, -		       int, nr, u32 __user *, iocb) -{ -	struct iocb __user * __user *iocb64; -	long ret; +	blk_start_plug(&plug); +	for (i = 0; i < nr; i++) { +		compat_uptr_t user_iocb; -	if (unlikely(nr < 0)) -		return -EINVAL; +		if (unlikely(get_user(user_iocb, iocbpp + i))) { +			ret = -EFAULT; +			break; +		} -	if (nr > MAX_AIO_SUBMITS) -		nr = MAX_AIO_SUBMITS; +		ret = io_submit_one(ctx, compat_ptr(user_iocb), true); +		if (ret) +			break; +	} +	blk_finish_plug(&plug); -	iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); -	ret = copy_iocb(nr, iocb, iocb64); -	if (!ret) -		ret = do_io_submit(ctx_id, nr, iocb64, 1); -	return ret; +	percpu_ref_put(&ctx->users); +	return i ? i : ret;  }  #endif @@ -1755,15 +1914,12 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,   *	Finds a given iocb for cancellation.   
*/  static struct aio_kiocb * -lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key) +lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb)  {  	struct aio_kiocb *kiocb;  	assert_spin_locked(&ctx->ctx_lock); -	if (key != KIOCB_KEY) -		return NULL; -  	/* TODO: use a hash or array, this sucks. */  	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {  		if (kiocb->ki_user_iocb == iocb) @@ -1787,25 +1943,24 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,  {  	struct kioctx *ctx;  	struct aio_kiocb *kiocb; +	int ret = -EINVAL;  	u32 key; -	int ret; -	ret = get_user(key, &iocb->aio_key); -	if (unlikely(ret)) +	if (unlikely(get_user(key, &iocb->aio_key)))  		return -EFAULT; +	if (unlikely(key != KIOCB_KEY)) +		return -EINVAL;  	ctx = lookup_ioctx(ctx_id);  	if (unlikely(!ctx))  		return -EINVAL;  	spin_lock_irq(&ctx->ctx_lock); - -	kiocb = lookup_kiocb(ctx, iocb, key); -	if (kiocb) -		ret = kiocb_cancel(kiocb); -	else -		ret = -EINVAL; - +	kiocb = lookup_kiocb(ctx, iocb); +	if (kiocb) { +		ret = kiocb->ki_cancel(&kiocb->rw); +		list_del_init(&kiocb->ki_list); +	}  	spin_unlock_irq(&ctx->ctx_lock);  	if (!ret) { @@ -1860,13 +2015,60 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,  		struct timespec __user *, timeout)  {  	struct timespec64	ts; +	int			ret; -	if (timeout) { -		if (unlikely(get_timespec64(&ts, timeout))) +	if (timeout && unlikely(get_timespec64(&ts, timeout))) +		return -EFAULT; + +	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? 
&ts : NULL); +	if (!ret && signal_pending(current)) +		ret = -EINTR; +	return ret; +} + +SYSCALL_DEFINE6(io_pgetevents, +		aio_context_t, ctx_id, +		long, min_nr, +		long, nr, +		struct io_event __user *, events, +		struct timespec __user *, timeout, +		const struct __aio_sigset __user *, usig) +{ +	struct __aio_sigset	ksig = { NULL, }; +	sigset_t		ksigmask, sigsaved; +	struct timespec64	ts; +	int ret; + +	if (timeout && unlikely(get_timespec64(&ts, timeout))) +		return -EFAULT; + +	if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) +		return -EFAULT; + +	if (ksig.sigmask) { +		if (ksig.sigsetsize != sizeof(sigset_t)) +			return -EINVAL; +		if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask)))  			return -EFAULT; +		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); +		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); +	} + +	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); +	if (signal_pending(current)) { +		if (ksig.sigmask) { +			current->saved_sigmask = sigsaved; +			set_restore_sigmask(); +		} + +		if (!ret) +			ret = -ERESTARTNOHAND; +	} else { +		if (ksig.sigmask) +			sigprocmask(SIG_SETMASK, &sigsaved, NULL);  	} -	return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); +	return ret;  }  #ifdef CONFIG_COMPAT @@ -1877,13 +2079,64 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,  		       struct compat_timespec __user *, timeout)  {  	struct timespec64 t; +	int ret; + +	if (timeout && compat_get_timespec64(&t, timeout)) +		return -EFAULT; + +	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? 
&t : NULL); +	if (!ret && signal_pending(current)) +		ret = -EINTR; +	return ret; +} + + +struct __compat_aio_sigset { +	compat_sigset_t __user	*sigmask; +	compat_size_t		sigsetsize; +}; + +COMPAT_SYSCALL_DEFINE6(io_pgetevents, +		compat_aio_context_t, ctx_id, +		compat_long_t, min_nr, +		compat_long_t, nr, +		struct io_event __user *, events, +		struct compat_timespec __user *, timeout, +		const struct __compat_aio_sigset __user *, usig) +{ +	struct __compat_aio_sigset ksig = { NULL, }; +	sigset_t ksigmask, sigsaved; +	struct timespec64 t; +	int ret; + +	if (timeout && compat_get_timespec64(&t, timeout)) +		return -EFAULT; + +	if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) +		return -EFAULT; -	if (timeout) { -		if (compat_get_timespec64(&t, timeout)) +	if (ksig.sigmask) { +		if (ksig.sigsetsize != sizeof(compat_sigset_t)) +			return -EINVAL; +		if (get_compat_sigset(&ksigmask, ksig.sigmask))  			return -EFAULT; +		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); +		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); +	} +	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); +	if (signal_pending(current)) { +		if (ksig.sigmask) { +			current->saved_sigmask = sigsaved; +			set_restore_sigmask(); +		} +		if (!ret) +			ret = -ERESTARTNOHAND; +	} else { +		if (ksig.sigmask) +			sigprocmask(SIG_SETMASK, &sigsaved, NULL);  	} -	return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? 
&t : NULL); +	return ret;  }  #endif diff --git a/fs/eventfd.c b/fs/eventfd.c index 08d3bd602f73..61c9514da5e9 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -101,14 +101,20 @@ static int eventfd_release(struct inode *inode, struct file *file)  	return 0;  } -static __poll_t eventfd_poll(struct file *file, poll_table *wait) +static struct wait_queue_head * +eventfd_get_poll_head(struct file *file, __poll_t events) +{ +	struct eventfd_ctx *ctx = file->private_data; + +	return &ctx->wqh; +} + +static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask)  {  	struct eventfd_ctx *ctx = file->private_data;  	__poll_t events = 0;  	u64 count; -	poll_wait(file, &ctx->wqh, wait); -  	/*  	 * All writes to ctx->count occur within ctx->wqh.lock.  This read  	 * can be done outside ctx->wqh.lock because we know that poll_wait @@ -305,7 +311,8 @@ static const struct file_operations eventfd_fops = {  	.show_fdinfo	= eventfd_show_fdinfo,  #endif  	.release	= eventfd_release, -	.poll		= eventfd_poll, +	.get_poll_head	= eventfd_get_poll_head, +	.poll_mask	= eventfd_poll_mask,  	.read		= eventfd_read,  	.write		= eventfd_write,  	.llseek		= noop_llseek, diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 602ca4285b2e..67db22fe99c5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -884,8 +884,7 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,  	pt->_key = epi->event.events;  	if (!is_file_epoll(epi->ffd.file)) -		return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & -		       epi->event.events; +		return vfs_poll(epi->ffd.file, pt) & epi->event.events;  	ep = epi->ffd.file->private_data;  	poll_wait(epi->ffd.file, &ep->poll_wait, pt); @@ -2025,7 +2024,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,  	/* The target file descriptor must support poll */  	error = -EPERM; -	if (!tf.file->f_op->poll) +	if (!file_can_poll(tf.file))  		goto error_tgt_fput;  	/* Check if EPOLLWAKEUP is allowed */ diff --git a/fs/pipe.c b/fs/pipe.c index 
39d6f431da83..bb0840e234f3 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -509,19 +509,22 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  	}  } -/* No kernel lock held - fine */ -static __poll_t -pipe_poll(struct file *filp, poll_table *wait) +static struct wait_queue_head * +pipe_get_poll_head(struct file *filp, __poll_t events)  { -	__poll_t mask;  	struct pipe_inode_info *pipe = filp->private_data; -	int nrbufs; -	poll_wait(filp, &pipe->wait, wait); +	return &pipe->wait; +} + +/* No kernel lock held - fine */ +static __poll_t pipe_poll_mask(struct file *filp, __poll_t events) +{ +	struct pipe_inode_info *pipe = filp->private_data; +	int nrbufs = pipe->nrbufs; +	__poll_t mask = 0;  	/* Reading only -- no need for acquiring the semaphore.  */ -	nrbufs = pipe->nrbufs; -	mask = 0;  	if (filp->f_mode & FMODE_READ) {  		mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0;  		if (!pipe->writers && filp->f_version != pipe->w_counter) @@ -1020,7 +1023,8 @@ const struct file_operations pipefifo_fops = {  	.llseek		= no_llseek,  	.read_iter	= pipe_read,  	.write_iter	= pipe_write, -	.poll		= pipe_poll, +	.get_poll_head	= pipe_get_poll_head, +	.poll_mask	= pipe_poll_mask,  	.unlocked_ioctl	= pipe_ioctl,  	.release	= pipe_release,  	.fasync		= pipe_fasync, diff --git a/fs/select.c b/fs/select.c index ba879c51288f..bc3cc0f98896 100644 --- a/fs/select.c +++ b/fs/select.c @@ -34,6 +34,29 @@  #include <linux/uaccess.h> +__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt) +{ +	if (file->f_op->poll) { +		return file->f_op->poll(file, pt); +	} else if (file_has_poll_mask(file)) { +		unsigned int events = poll_requested_events(pt); +		struct wait_queue_head *head; + +		if (pt && pt->_qproc) { +			head = file->f_op->get_poll_head(file, events); +			if (!head) +				return DEFAULT_POLLMASK; +			if (IS_ERR(head)) +				return EPOLLERR; +			pt->_qproc(file, head, pt); +		} + +		return file->f_op->poll_mask(file, events); +	} else { +		return 
DEFAULT_POLLMASK; +	} +} +EXPORT_SYMBOL_GPL(vfs_poll);  /*   * Estimate expected accuracy in ns from a timeval. @@ -233,7 +256,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,  	add_wait_queue(wait_address, &entry->wait);  } -int poll_schedule_timeout(struct poll_wqueues *pwq, int state, +static int poll_schedule_timeout(struct poll_wqueues *pwq, int state,  			  ktime_t *expires, unsigned long slack)  {  	int rc = -EINTR; @@ -258,7 +281,6 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,  	return rc;  } -EXPORT_SYMBOL(poll_schedule_timeout);  /**   * poll_select_set_timeout - helper function to setup the timeout value @@ -503,14 +525,10 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)  					continue;  				f = fdget(i);  				if (f.file) { -					const struct file_operations *f_op; -					f_op = f.file->f_op; -					mask = DEFAULT_POLLMASK; -					if (f_op->poll) { -						wait_key_set(wait, in, out, -							     bit, busy_flag); -						mask = (*f_op->poll)(f.file, wait); -					} +					wait_key_set(wait, in, out, bit, +						     busy_flag); +					mask = vfs_poll(f.file, wait); +  					fdput(f);  					if ((mask & POLLIN_SET) && (in & bit)) {  						res_in |= bit; @@ -813,34 +831,29 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,  				     bool *can_busy_poll,  				     __poll_t busy_flag)  { -	__poll_t mask; -	int fd; +	int fd = pollfd->fd; +	__poll_t mask = 0, filter; +	struct fd f; -	mask = 0; -	fd = pollfd->fd; -	if (fd >= 0) { -		struct fd f = fdget(fd); -		mask = EPOLLNVAL; -		if (f.file) { -			/* userland u16 ->events contains POLL... 
bitmap */ -			__poll_t filter = demangle_poll(pollfd->events) | -						EPOLLERR | EPOLLHUP; -			mask = DEFAULT_POLLMASK; -			if (f.file->f_op->poll) { -				pwait->_key = filter; -				pwait->_key |= busy_flag; -				mask = f.file->f_op->poll(f.file, pwait); -				if (mask & busy_flag) -					*can_busy_poll = true; -			} -			/* Mask out unneeded events. */ -			mask &= filter; -			fdput(f); -		} -	} +	if (fd < 0) +		goto out; +	mask = EPOLLNVAL; +	f = fdget(fd); +	if (!f.file) +		goto out; + +	/* userland u16 ->events contains POLL... bitmap */ +	filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP; +	pwait->_key = filter | busy_flag; +	mask = vfs_poll(f.file, pwait); +	if (mask & busy_flag) +		*can_busy_poll = true; +	mask &= filter;		/* Mask out unneeded events. */ +	fdput(f); + +out:  	/* ... and so does ->revents */  	pollfd->revents = mangle_poll(mask); -  	return mask;  } diff --git a/fs/timerfd.c b/fs/timerfd.c index cdad49da3ff7..d84a2bee4f82 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -226,21 +226,20 @@ static int timerfd_release(struct inode *inode, struct file *file)  	kfree_rcu(ctx, rcu);  	return 0;  } - -static __poll_t timerfd_poll(struct file *file, poll_table *wait) +	 +static struct wait_queue_head *timerfd_get_poll_head(struct file *file, +		__poll_t eventmask)  {  	struct timerfd_ctx *ctx = file->private_data; -	__poll_t events = 0; -	unsigned long flags; -	poll_wait(file, &ctx->wqh, wait); +	return &ctx->wqh; +} -	spin_lock_irqsave(&ctx->wqh.lock, flags); -	if (ctx->ticks) -		events |= EPOLLIN; -	spin_unlock_irqrestore(&ctx->wqh.lock, flags); +static __poll_t timerfd_poll_mask(struct file *file, __poll_t eventmask) +{ +	struct timerfd_ctx *ctx = file->private_data; -	return events; +	return ctx->ticks ? 
EPOLLIN : 0;  }  static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, @@ -364,7 +363,8 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg  static const struct file_operations timerfd_fops = {  	.release	= timerfd_release, -	.poll		= timerfd_poll, +	.get_poll_head	= timerfd_get_poll_head, +	.poll_mask	= timerfd_poll_mask,  	.read		= timerfd_read,  	.llseek		= noop_llseek,  	.show_fdinfo	= timerfd_show, diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index 482461d8931d..cc414db9da0a 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -245,8 +245,7 @@ ssize_t af_alg_sendpage(struct socket *sock, struct page *page,  			int offset, size_t size, int flags);  void af_alg_free_resources(struct af_alg_async_req *areq);  void af_alg_async_cb(struct crypto_async_request *_req, int err); -__poll_t af_alg_poll(struct file *file, struct socket *sock, -			 poll_table *wait); +__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events);  struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk,  					   unsigned int areqlen);  int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags, diff --git a/include/linux/aio.h b/include/linux/aio.h index 9d8aabecfe2d..b83e68dd006f 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -8,8 +8,6 @@ struct kioctx;  struct kiocb;  struct mm_struct; -#define KIOCB_KEY		0 -  typedef int (kiocb_cancel_fn)(struct kiocb *);  /* prototypes */ diff --git a/include/linux/compat.h b/include/linux/compat.h index 081281ad5772..ad192057b887 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -330,6 +330,7 @@ extern int put_compat_rusage(const struct rusage *,  			     struct compat_rusage __user *);  struct compat_siginfo; +struct __compat_aio_sigset;  struct compat_dirent {  	u32		d_ino; @@ -553,6 +554,12 @@ asmlinkage long compat_sys_io_getevents(compat_aio_context_t ctx_id,  					compat_long_t nr,  					struct io_event 
__user *events,  					struct compat_timespec __user *timeout); +asmlinkage long compat_sys_io_pgetevents(compat_aio_context_t ctx_id, +					compat_long_t min_nr, +					compat_long_t nr, +					struct io_event __user *events, +					struct compat_timespec __user *timeout, +					const struct __compat_aio_sigset __user *usig);  /* fs/cookies.c */  asmlinkage long compat_sys_lookup_dcookie(u32, u32, char __user *, compat_size_t); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7f86730b67a9..d4c37d371da5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1711,6 +1711,8 @@ struct file_operations {  	int (*iterate) (struct file *, struct dir_context *);  	int (*iterate_shared) (struct file *, struct dir_context *);  	__poll_t (*poll) (struct file *, struct poll_table_struct *); +	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); +	__poll_t (*poll_mask) (struct file *, __poll_t);  	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);  	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);  	int (*mmap) (struct file *, struct vm_area_struct *); diff --git a/include/linux/net.h b/include/linux/net.h index 2248a052061d..3fd9d8c16581 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -147,6 +147,7 @@ struct proto_ops {  	int		(*getname)   (struct socket *sock,  				      struct sockaddr *addr,  				      int peer); +	__poll_t	(*poll_mask) (struct socket *sock, __poll_t events);  	__poll_t	(*poll)	     (struct file *file, struct socket *sock,  				      struct poll_table_struct *wait);  	int		(*ioctl)     (struct socket *sock, unsigned int cmd, diff --git a/include/linux/poll.h b/include/linux/poll.h index f45ebd017eaa..fdf86b4cbc71 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -74,6 +74,18 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)  	pt->_key   = ~(__poll_t)0; /* all events enabled */  } +static inline bool file_has_poll_mask(struct file *file) +{ 
+	return file->f_op->get_poll_head && file->f_op->poll_mask; +} + +static inline bool file_can_poll(struct file *file) +{ +	return file->f_op->poll || file_has_poll_mask(file); +} + +__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt); +  struct poll_table_entry {  	struct file *filp;  	__poll_t key; @@ -96,8 +108,6 @@ struct poll_wqueues {  extern void poll_initwait(struct poll_wqueues *pwq);  extern void poll_freewait(struct poll_wqueues *pwq); -extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state, -				 ktime_t *expires, unsigned long slack);  extern u64 select_estimate_accuracy(struct timespec64 *tv);  #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9065477ed255..89198379b39d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3250,8 +3250,7 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,  				    int *peeked, int *off, int *err);  struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,  				  int *err); -__poll_t datagram_poll(struct file *file, struct socket *sock, -			   struct poll_table_struct *wait); +__poll_t datagram_poll_mask(struct socket *sock, __poll_t events);  int skb_copy_datagram_iter(const struct sk_buff *from, int offset,  			   struct iov_iter *to, int size);  static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 70fcda1a9049..811172fcb916 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -290,6 +290,12 @@ asmlinkage long sys_io_getevents(aio_context_t ctx_id,  				long nr,  				struct io_event __user *events,  				struct timespec __user *timeout); +asmlinkage long sys_io_pgetevents(aio_context_t ctx_id, +				long min_nr, +				long nr, +				struct io_event __user *events, +				struct timespec __user *timeout, +				const struct __aio_sigset *sig);  /* 
fs/xattr.c */  asmlinkage long sys_setxattr(const char __user *path, const char __user *name, diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index ec9d6bc65855..53ce8176c313 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -271,7 +271,7 @@ int  bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,  		     int flags);  int  bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg,  			    size_t len, int flags); -__poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait); +__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events);  int  bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);  int  bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo);  int  bt_sock_wait_ready(struct sock *sk, unsigned long flags); diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 71c72a939bf8..c5187438af38 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -121,6 +121,21 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock)  #endif  } +static inline void sock_poll_busy_loop(struct socket *sock, __poll_t events) +{ +	if (sk_can_busy_loop(sock->sk) && +	    events && (events & POLL_BUSY_LOOP)) { +		/* once, only if requested by syscall */ +		sk_busy_loop(sock->sk, 1); +	} +} + +/* if this socket can poll_ll, tell the system call */ +static inline __poll_t sock_poll_busy_flag(struct socket *sock) +{ +	return sk_can_busy_loop(sock->sk) ? 
POLL_BUSY_LOOP : 0; +} +  /* used in the NIC receive handler to mark the skb */  static inline void skb_mark_napi_id(struct sk_buff *skb,  				    struct napi_struct *napi) diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h index f4c21b5a1242..b0eaeb02d46d 100644 --- a/include/net/iucv/af_iucv.h +++ b/include/net/iucv/af_iucv.h @@ -153,8 +153,6 @@ struct iucv_sock_list {  	atomic_t	  autobind_name;  }; -__poll_t iucv_sock_poll(struct file *file, struct socket *sock, -			    poll_table *wait);  void iucv_sock_link(struct iucv_sock_list *l, struct sock *s);  void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *s);  void iucv_accept_enqueue(struct sock *parent, struct sock *sk); diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 35498e613ff5..e6d349b2a791 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -109,8 +109,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb);  int sctp_inet_listen(struct socket *sock, int backlog);  void sctp_write_space(struct sock *sk);  void sctp_data_ready(struct sock *sk); -__poll_t sctp_poll(struct file *file, struct socket *sock, -		poll_table *wait); +__poll_t sctp_poll_mask(struct socket *sock, __poll_t events);  void sctp_sock_rfree(struct sk_buff *skb);  void sctp_copy_sock(struct sock *newsk, struct sock *sk,  		    struct sctp_association *asoc); diff --git a/include/net/sock.h b/include/net/sock.h index 74d725fdbe0f..4d2e8ad98985 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1591,8 +1591,6 @@ int sock_no_connect(struct socket *, struct sockaddr *, int, int);  int sock_no_socketpair(struct socket *, struct socket *);  int sock_no_accept(struct socket *, struct socket *, int, bool);  int sock_no_getname(struct socket *, struct sockaddr *, int); -__poll_t sock_no_poll(struct file *, struct socket *, -			  struct poll_table_struct *);  int sock_no_ioctl(struct socket *, unsigned int, unsigned long);  int sock_no_listen(struct socket *, int);  
int sock_no_shutdown(struct socket *, int); diff --git a/include/net/tcp.h b/include/net/tcp.h index 51dc7a26a2fa..f88f8a2cab0d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -388,8 +388,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);  void tcp_close(struct sock *sk, long timeout);  void tcp_init_sock(struct sock *sk);  void tcp_init_transfer(struct sock *sk, int bpf_op); -__poll_t tcp_poll(struct file *file, struct socket *sock, -		      struct poll_table_struct *wait); +__poll_t tcp_poll_mask(struct socket *sock, __poll_t events);  int tcp_getsockopt(struct sock *sk, int level, int optname,  		   char __user *optval, int __user *optlen);  int tcp_setsockopt(struct sock *sk, int level, int optname, diff --git a/include/net/udp.h b/include/net/udp.h index 621778b80e3d..d8ca3b26964d 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -276,7 +276,7 @@ int udp_init_sock(struct sock *sk);  int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);  int __udp_disconnect(struct sock *sk, int flags);  int udp_disconnect(struct sock *sk, int flags); -__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); +__poll_t udp_poll_mask(struct socket *sock, __poll_t events);  struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,  				       netdev_features_t features,  				       bool is_ipv6); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 8bcb186c6f67..42990676a55e 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -732,9 +732,11 @@ __SYSCALL(__NR_pkey_alloc,    sys_pkey_alloc)  __SYSCALL(__NR_pkey_free,     sys_pkey_free)  #define __NR_statx 291  __SYSCALL(__NR_statx,     sys_statx) +#define __NR_io_pgetevents 292 +__SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)  #undef __NR_syscalls -#define __NR_syscalls 292 +#define __NR_syscalls 293  /*   * 32 bit systems traditionally used 
different diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index a04adbc70ddf..ed0185945bb2 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -29,6 +29,7 @@  #include <linux/types.h>  #include <linux/fs.h> +#include <linux/signal.h>  #include <asm/byteorder.h>  typedef __kernel_ulong_t aio_context_t; @@ -38,10 +39,8 @@ enum {  	IOCB_CMD_PWRITE = 1,  	IOCB_CMD_FSYNC = 2,  	IOCB_CMD_FDSYNC = 3, -	/* These two are experimental. -	 * IOCB_CMD_PREADX = 4, -	 * IOCB_CMD_POLL = 5, -	 */ +	/* 4 was the experimental IOCB_CMD_PREADX */ +	IOCB_CMD_POLL = 5,  	IOCB_CMD_NOOP = 6,  	IOCB_CMD_PREADV = 7,  	IOCB_CMD_PWRITEV = 8, @@ -108,5 +107,10 @@ struct iocb {  #undef IFBIG  #undef IFLITTLE +struct __aio_sigset { +	sigset_t __user	*sigmask; +	size_t		sigsetsize; +}; +  #endif /* __LINUX__AIO_ABI_H */ diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h index cd4f0b897a48..2fce8b6876e9 100644 --- a/include/uapi/linux/types.h +++ b/include/uapi/linux/types.h @@ -49,11 +49,7 @@ typedef __u32 __bitwise __wsum;  #define __aligned_be64 __be64 __attribute__((aligned(8)))  #define __aligned_le64 __le64 __attribute__((aligned(8))) -#ifdef __CHECK_POLL  typedef unsigned __bitwise __poll_t; -#else -typedef unsigned __poll_t; -#endif  #endif /*  __ASSEMBLY__ */  #endif /* _UAPI_LINUX_TYPES_H */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 9791364925dc..183169c2a75b 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -43,7 +43,9 @@ COND_SYSCALL(io_submit);  COND_SYSCALL_COMPAT(io_submit);  COND_SYSCALL(io_cancel);  COND_SYSCALL(io_getevents); +COND_SYSCALL(io_pgetevents);  COND_SYSCALL_COMPAT(io_getevents); +COND_SYSCALL_COMPAT(io_pgetevents);  /* fs/xattr.c */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bd3df3d101a..1695f38630f1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3849,7 +3849,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,  	if (ret)  		goto out_put_css; 
-	efile.file->f_op->poll(efile.file, &event->pt); +	vfs_poll(efile.file, &event->pt);  	spin_lock(&memcg->event_list_lock);  	list_add(&event->list, &memcg->event_list); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 848969fe7979..588bf88c3305 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -231,7 +231,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err)  static __poll_t  p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err)  { -	__poll_t ret, n; +	__poll_t ret;  	struct p9_trans_fd *ts = NULL;  	if (client && client->status == Connected) @@ -243,19 +243,9 @@ p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err)  		return EPOLLERR;  	} -	if (!ts->rd->f_op->poll) -		ret = DEFAULT_POLLMASK; -	else -		ret = ts->rd->f_op->poll(ts->rd, pt); - -	if (ts->rd != ts->wr) { -		if (!ts->wr->f_op->poll) -			n = DEFAULT_POLLMASK; -		else -			n = ts->wr->f_op->poll(ts->wr, pt); -		ret = (ret & ~EPOLLOUT) | (n & ~EPOLLIN); -	} - +	ret = vfs_poll(ts->rd, pt); +	if (ts->rd != ts->wr) +		ret = (ret & ~EPOLLOUT) | (vfs_poll(ts->wr, pt) & ~EPOLLIN);  	return ret;  } diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 9b6bc5abe946..55fdba05d7d9 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1869,7 +1869,7 @@ static const struct proto_ops atalk_dgram_ops = {  	.socketpair	= sock_no_socketpair,  	.accept		= sock_no_accept,  	.getname	= atalk_getname, -	.poll		= datagram_poll, +	.poll_mask	= datagram_poll_mask,  	.ioctl		= atalk_ioctl,  #ifdef CONFIG_COMPAT  	.compat_ioctl	= atalk_compat_ioctl, diff --git a/net/atm/common.c b/net/atm/common.c index fc78a0508ae1..1f2af59935db 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -648,16 +648,11 @@ out:  	return error;  } -__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t vcc_poll_mask(struct socket *sock, __poll_t events)  {  	struct sock *sk = sock->sk; -	struct atm_vcc *vcc; -	__poll_t mask; - -	
sock_poll_wait(file, sk_sleep(sk), wait); -	mask = 0; - -	vcc = ATM_SD(sock); +	struct atm_vcc *vcc = ATM_SD(sock); +	__poll_t mask = 0;  	/* exceptional events */  	if (sk->sk_err) diff --git a/net/atm/common.h b/net/atm/common.h index 5850649068bb..526796ad230f 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -17,7 +17,7 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci);  int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,  		int flags);  int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len); -__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait); +__poll_t vcc_poll_mask(struct socket *sock, __poll_t events);  int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);  int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);  int vcc_setsockopt(struct socket *sock, int level, int optname, diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 2cb10af16afc..9f75092fe778 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -113,7 +113,7 @@ static const struct proto_ops pvc_proto_ops = {  	.socketpair =	sock_no_socketpair,  	.accept =	sock_no_accept,  	.getname =	pvc_getname, -	.poll =		vcc_poll, +	.poll_mask =	vcc_poll_mask,  	.ioctl =	vcc_ioctl,  #ifdef CONFIG_COMPAT  	.compat_ioctl = vcc_compat_ioctl, diff --git a/net/atm/svc.c b/net/atm/svc.c index 2f91b766ac42..53f4ad7087b1 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -636,7 +636,7 @@ static const struct proto_ops svc_proto_ops = {  	.socketpair =	sock_no_socketpair,  	.accept =	svc_accept,  	.getname =	svc_getname, -	.poll =		vcc_poll, +	.poll_mask =	vcc_poll_mask,  	.ioctl =	svc_ioctl,  #ifdef CONFIG_COMPAT  	.compat_ioctl =	svc_compat_ioctl, diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index c603d33d5410..d1d2442ce573 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1941,7 +1941,7 @@ static const struct proto_ops ax25_proto_ops = {  	.socketpair	= sock_no_socketpair,  	
.accept		= ax25_accept,  	.getname	= ax25_getname, -	.poll		= datagram_poll, +	.poll_mask	= datagram_poll_mask,  	.ioctl		= ax25_ioctl,  	.listen		= ax25_listen,  	.shutdown	= ax25_shutdown, diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 3264e1873219..510ab4f55df5 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -437,16 +437,13 @@ static inline __poll_t bt_accept_poll(struct sock *parent)  	return 0;  } -__poll_t bt_sock_poll(struct file *file, struct socket *sock, -			  poll_table *wait) +__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events)  {  	struct sock *sk = sock->sk;  	__poll_t mask = 0;  	BT_DBG("sock %p, sk %p", sock, sk); -	poll_wait(file, sk_sleep(sk), wait); -  	if (sk->sk_state == BT_LISTEN)  		return bt_accept_poll(sk); @@ -478,7 +475,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock,  	return mask;  } -EXPORT_SYMBOL(bt_sock_poll); +EXPORT_SYMBOL(bt_sock_poll_mask);  int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)  { diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index b5116fa9835e..00deacdcb51c 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -175,7 +175,6 @@ static const struct proto_ops bnep_sock_ops = {  	.getname	= sock_no_getname,  	.sendmsg	= sock_no_sendmsg,  	.recvmsg	= sock_no_recvmsg, -	.poll		= sock_no_poll,  	.listen		= sock_no_listen,  	.shutdown	= sock_no_shutdown,  	.setsockopt	= sock_no_setsockopt, diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index ce86a7bae844..e08f28fadd65 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -178,7 +178,6 @@ static const struct proto_ops cmtp_sock_ops = {  	.getname	= sock_no_getname,  	.sendmsg	= sock_no_sendmsg,  	.recvmsg	= sock_no_recvmsg, -	.poll		= sock_no_poll,  	.listen		= sock_no_listen,  	.shutdown	= sock_no_shutdown,  	.setsockopt	= sock_no_setsockopt, diff --git 
a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 1506e1632394..d6c099861538 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1975,7 +1975,7 @@ static const struct proto_ops hci_sock_ops = {  	.sendmsg	= hci_sock_sendmsg,  	.recvmsg	= hci_sock_recvmsg,  	.ioctl		= hci_sock_ioctl, -	.poll		= datagram_poll, +	.poll_mask	= datagram_poll_mask,  	.listen		= sock_no_listen,  	.shutdown	= sock_no_shutdown,  	.setsockopt	= hci_sock_setsockopt, diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 008ba439bd62..1eaac01f85de 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -208,7 +208,6 @@ static const struct proto_ops hidp_sock_ops = {  	.getname	= sock_no_getname,  	.sendmsg	= sock_no_sendmsg,  	.recvmsg	= sock_no_recvmsg, -	.poll		= sock_no_poll,  	.listen		= sock_no_listen,  	.shutdown	= sock_no_shutdown,  	.setsockopt	= sock_no_setsockopt, diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 686bdc6b35b0..742a190034e6 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1653,7 +1653,7 @@ static const struct proto_ops l2cap_sock_ops = {  	.getname	= l2cap_sock_getname,  	.sendmsg	= l2cap_sock_sendmsg,  	.recvmsg	= l2cap_sock_recvmsg, -	.poll		= bt_sock_poll, +	.poll_mask	= bt_sock_poll_mask,  	.ioctl		= bt_sock_ioctl,  	.mmap		= sock_no_mmap,  	.socketpair	= sock_no_socketpair, diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index d606e9212291..1cf57622473a 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -1049,7 +1049,7 @@ static const struct proto_ops rfcomm_sock_ops = {  	.setsockopt	= rfcomm_sock_setsockopt,  	.getsockopt	= rfcomm_sock_getsockopt,  	.ioctl		= rfcomm_sock_ioctl, -	.poll		= bt_sock_poll, +	.poll_mask	= bt_sock_poll_mask,  	.socketpair	= sock_no_socketpair,  	.mmap		= sock_no_mmap  }; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 413b8ee49fec..d60dbc61d170 100644 
--- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1197,7 +1197,7 @@ static const struct proto_ops sco_sock_ops = {  	.getname	= sco_sock_getname,  	.sendmsg	= sco_sock_sendmsg,  	.recvmsg	= sco_sock_recvmsg, -	.poll		= bt_sock_poll, +	.poll_mask	= bt_sock_poll_mask,  	.ioctl		= bt_sock_ioctl,  	.mmap		= sock_no_mmap,  	.socketpair	= sock_no_socketpair, diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index a6fb1b3bcad9..c7991867d622 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -934,15 +934,11 @@ static int caif_release(struct socket *sock)  }  /* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */ -static __poll_t caif_poll(struct file *file, -			      struct socket *sock, poll_table *wait) +static __poll_t caif_poll_mask(struct socket *sock, __poll_t events)  {  	struct sock *sk = sock->sk; -	__poll_t mask;  	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); - -	sock_poll_wait(file, sk_sleep(sk), wait); -	mask = 0; +	__poll_t mask = 0;  	/* exceptional events? 
*/  	if (sk->sk_err) @@ -976,7 +972,7 @@ static const struct proto_ops caif_seqpacket_ops = {  	.socketpair = sock_no_socketpair,  	.accept = sock_no_accept,  	.getname = sock_no_getname, -	.poll = caif_poll, +	.poll_mask = caif_poll_mask,  	.ioctl = sock_no_ioctl,  	.listen = sock_no_listen,  	.shutdown = sock_no_shutdown, @@ -997,7 +993,7 @@ static const struct proto_ops caif_stream_ops = {  	.socketpair = sock_no_socketpair,  	.accept = sock_no_accept,  	.getname = sock_no_getname, -	.poll = caif_poll, +	.poll_mask = caif_poll_mask,  	.ioctl = sock_no_ioctl,  	.listen = sock_no_listen,  	.shutdown = sock_no_shutdown, diff --git a/net/can/bcm.c b/net/can/bcm.c index 6ad89f49b341..97fedff3f0c4 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1657,7 +1657,7 @@ static const struct proto_ops bcm_ops = {  	.socketpair    = sock_no_socketpair,  	.accept        = sock_no_accept,  	.getname       = sock_no_getname, -	.poll          = datagram_poll, +	.poll_mask     = datagram_poll_mask,  	.ioctl         = can_ioctl,	/* use can_ioctl() from af_can.c */  	.listen        = sock_no_listen,  	.shutdown      = sock_no_shutdown, diff --git a/net/can/raw.c b/net/can/raw.c index 1051eee82581..fd7e2f49ea6a 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -843,7 +843,7 @@ static const struct proto_ops raw_ops = {  	.socketpair    = sock_no_socketpair,  	.accept        = sock_no_accept,  	.getname       = raw_getname, -	.poll          = datagram_poll, +	.poll_mask     = datagram_poll_mask,  	.ioctl         = can_ioctl,	/* use can_ioctl() from af_can.c */  	.listen        = sock_no_listen,  	.shutdown      = sock_no_shutdown, diff --git a/net/core/datagram.c b/net/core/datagram.c index 9938952c5c78..f19bf3dc2bd6 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -819,9 +819,8 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);  /**   * 	datagram_poll - generic datagram poll - *	@file: file struct   *	@sock: socket - *	@wait: poll table + *	@events to wait for   *   *	
Datagram poll: Again totally generic. This also handles   *	sequenced packet sockets providing the socket receive queue @@ -831,14 +830,10 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);   *	and you use a different write policy from sock_writeable()   *	then please supply your own write_space callback.   */ -__poll_t datagram_poll(struct file *file, struct socket *sock, -			   poll_table *wait) +__poll_t datagram_poll_mask(struct socket *sock, __poll_t events)  {  	struct sock *sk = sock->sk; -	__poll_t mask; - -	sock_poll_wait(file, sk_sleep(sk), wait); -	mask = 0; +	__poll_t mask = 0;  	/* exceptional events? */  	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) @@ -871,4 +866,4 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,  	return mask;  } -EXPORT_SYMBOL(datagram_poll); +EXPORT_SYMBOL(datagram_poll_mask); diff --git a/net/core/sock.c b/net/core/sock.c index 815770333d91..2aed99a541d5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2567,12 +2567,6 @@ int sock_no_getname(struct socket *sock, struct sockaddr *saddr,  }  EXPORT_SYMBOL(sock_no_getname); -__poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt) -{ -	return 0; -} -EXPORT_SYMBOL(sock_no_poll); -  int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)  {  	return -EOPNOTSUPP; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index f91e3816806b..0ea2ee56ac1b 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -316,8 +316,7 @@ int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,  		 int flags, int *addr_len);  void dccp_shutdown(struct sock *sk, int how);  int inet_dccp_listen(struct socket *sock, int backlog); -__poll_t dccp_poll(struct file *file, struct socket *sock, -		       poll_table *wait); +__poll_t dccp_poll_mask(struct socket *sock, __poll_t events);  int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);  void dccp_req_err(struct sock *sk, u64 seq); diff --git 
a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b08feb219b44..a9e478cd3787 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -984,7 +984,7 @@ static const struct proto_ops inet_dccp_ops = {  	.accept		   = inet_accept,  	.getname	   = inet_getname,  	/* FIXME: work on tcp_poll to rename it to inet_csk_poll */ -	.poll		   = dccp_poll, +	.poll_mask	   = dccp_poll_mask,  	.ioctl		   = inet_ioctl,  	/* FIXME: work on inet_listen to rename it to sock_common_listen */  	.listen		   = inet_dccp_listen, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 6344f1b18a6a..17fc4e0166ba 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1070,7 +1070,7 @@ static const struct proto_ops inet6_dccp_ops = {  	.socketpair	   = sock_no_socketpair,  	.accept		   = inet_accept,  	.getname	   = inet6_getname, -	.poll		   = dccp_poll, +	.poll_mask	   = dccp_poll_mask,  	.ioctl		   = inet6_ioctl,  	.listen		   = inet_dccp_listen,  	.shutdown	   = inet_shutdown, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 0d56e36a6db7..ca21c1c76da0 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -312,20 +312,11 @@ int dccp_disconnect(struct sock *sk, int flags)  EXPORT_SYMBOL_GPL(dccp_disconnect); -/* - *	Wait for a DCCP event. - * - *	Note that we don't need to lock the socket, as the upper poll layers - *	take care of normal races (between the test and the event) and we don't - *	go look at any of the socket buffers directly. 
- */ -__poll_t dccp_poll(struct file *file, struct socket *sock, -		       poll_table *wait) +__poll_t dccp_poll_mask(struct socket *sock, __poll_t events)  {  	__poll_t mask;  	struct sock *sk = sock->sk; -	sock_poll_wait(file, sk_sleep(sk), wait);  	if (sk->sk_state == DCCP_LISTEN)  		return inet_csk_listen_poll(sk); @@ -367,7 +358,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock,  	return mask;  } -EXPORT_SYMBOL_GPL(dccp_poll); +EXPORT_SYMBOL_GPL(dccp_poll_mask);  int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)  { diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 7d6ff983ba2c..9a686d890bfa 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1207,11 +1207,11 @@ static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer)  } -static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table  *wait) +static __poll_t dn_poll_mask(struct socket *sock, __poll_t events)  {  	struct sock *sk = sock->sk;  	struct dn_scp *scp = DN_SK(sk); -	__poll_t mask = datagram_poll(file, sock, wait); +	__poll_t mask = datagram_poll_mask(sock, events);  	if (!skb_queue_empty(&scp->other_receive_queue))  		mask |= EPOLLRDBAND; @@ -2331,7 +2331,7 @@ static const struct proto_ops dn_proto_ops = {  	.socketpair =	sock_no_socketpair,  	.accept =	dn_accept,  	.getname =	dn_getname, -	.poll =		dn_poll, +	.poll_mask =	dn_poll_mask,  	.ioctl =	dn_ioctl,  	.listen =	dn_listen,  	.shutdown =	dn_shutdown, diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index a60658c85a9a..a0768d2759b8 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -423,7 +423,7 @@ static const struct proto_ops ieee802154_raw_ops = {  	.socketpair	   = sock_no_socketpair,  	.accept		   = sock_no_accept,  	.getname	   = sock_no_getname, -	.poll		   = datagram_poll, +	.poll_mask	   = datagram_poll_mask,  	.ioctl		   = ieee802154_sock_ioctl,  	.listen		   = sock_no_listen,  	.shutdown	   = sock_no_shutdown, @@ 
-969,7 +969,7 @@ static const struct proto_ops ieee802154_dgram_ops = {  	.socketpair	   = sock_no_socketpair,  	.accept		   = sock_no_accept,  	.getname	   = sock_no_getname, -	.poll		   = datagram_poll, +	.poll_mask	   = datagram_poll_mask,  	.ioctl		   = ieee802154_sock_ioctl,  	.listen		   = sock_no_listen,  	.shutdown	   = sock_no_shutdown, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index eaed0367e669..8a59428e63ab 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -986,7 +986,7 @@ const struct proto_ops inet_stream_ops = {  	.socketpair	   = sock_no_socketpair,  	.accept		   = inet_accept,  	.getname	   = inet_getname, -	.poll		   = tcp_poll, +	.poll_mask	   = tcp_poll_mask,  	.ioctl		   = inet_ioctl,  	.listen		   = inet_listen,  	.shutdown	   = inet_shutdown, @@ -1018,7 +1018,7 @@ const struct proto_ops inet_dgram_ops = {  	.socketpair	   = sock_no_socketpair,  	.accept		   = sock_no_accept,  	.getname	   = inet_getname, -	.poll		   = udp_poll, +	.poll_mask	   = udp_poll_mask,  	.ioctl		   = inet_ioctl,  	.listen		   = sock_no_listen,  	.shutdown	   = inet_shutdown, @@ -1039,7 +1039,7 @@ EXPORT_SYMBOL(inet_dgram_ops);  /*   * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without - * udp_poll + * udp_poll_mask   */  static const struct proto_ops inet_sockraw_ops = {  	.family		   = PF_INET, @@ -1050,7 +1050,7 @@ static const struct proto_ops inet_sockraw_ops = {  	.socketpair	   = sock_no_socketpair,  	.accept		   = sock_no_accept,  	.getname	   = inet_getname, -	.poll		   = datagram_poll, +	.poll_mask	   = datagram_poll_mask,  	.ioctl		   = inet_ioctl,  	.listen		   = sock_no_listen,  	.shutdown	   = inet_shutdown, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c9d00ef54dec..dec47e6789e7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -494,32 +494,21 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,  }  /* - *	Wait for a TCP event. 
- * - *	Note that we don't need to lock the socket, as the upper poll layers - *	take care of normal races (between the test and the event) and we don't - *	go look at any of the socket buffers directly. + * Socket is not locked. We are protected from async events by poll logic and + * correct handling of state changes made by other threads is impossible in + * any case.   */ -__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t tcp_poll_mask(struct socket *sock, __poll_t events)  { -	__poll_t mask;  	struct sock *sk = sock->sk;  	const struct tcp_sock *tp = tcp_sk(sk); +	__poll_t mask = 0;  	int state; -	sock_poll_wait(file, sk_sleep(sk), wait); -  	state = inet_sk_state_load(sk);  	if (state == TCP_LISTEN)  		return inet_csk_listen_poll(sk); -	/* Socket is not locked. We are protected from async events -	 * by poll logic and correct handling of state changes -	 * made by other threads is impossible in any case. -	 */ - -	mask = 0; -  	/*  	 * EPOLLHUP is certainly not done right. But poll() doesn't  	 * have a notion of HUP in just one direction, and for a @@ -600,7 +589,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)  	return mask;  } -EXPORT_SYMBOL(tcp_poll); +EXPORT_SYMBOL(tcp_poll_mask);  int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)  { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 051a43ff3fb8..675433eb53a8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2501,7 +2501,7 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname,   * 	udp_poll - wait for a UDP event.   *	@file - file struct   *	@sock - socket - *	@wait - poll table + *	@events - events to wait for   *   *	This is same as datagram poll, except for the special case of   *	blocking sockets. If application is using a blocking fd @@ -2510,23 +2510,23 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname,   *	but then block when reading it. 
Add special case code   *	to work around these arguably broken applications.   */ -__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t udp_poll_mask(struct socket *sock, __poll_t events)  { -	__poll_t mask = datagram_poll(file, sock, wait); +	__poll_t mask = datagram_poll_mask(sock, events);  	struct sock *sk = sock->sk;  	if (!skb_queue_empty(&udp_sk(sk)->reader_queue))  		mask |= EPOLLIN | EPOLLRDNORM;  	/* Check for false positives due to checksum errors */ -	if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) && +	if ((mask & EPOLLRDNORM) && !(sock->file->f_flags & O_NONBLOCK) &&  	    !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)  		mask &= ~(EPOLLIN | EPOLLRDNORM);  	return mask;  } -EXPORT_SYMBOL(udp_poll); +EXPORT_SYMBOL(udp_poll_mask);  int udp_abort(struct sock *sk, int err)  { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8da0b513f188..d443c18b45fe 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -571,7 +571,7 @@ const struct proto_ops inet6_stream_ops = {  	.socketpair	   = sock_no_socketpair,	/* a do nothing	*/  	.accept		   = inet_accept,		/* ok		*/  	.getname	   = inet6_getname, -	.poll		   = tcp_poll,			/* ok		*/ +	.poll_mask	   = tcp_poll_mask,		/* ok		*/  	.ioctl		   = inet6_ioctl,		/* must change  */  	.listen		   = inet_listen,		/* ok		*/  	.shutdown	   = inet_shutdown,		/* ok		*/ @@ -601,7 +601,7 @@ const struct proto_ops inet6_dgram_ops = {  	.socketpair	   = sock_no_socketpair,	/* a do nothing	*/  	.accept		   = sock_no_accept,		/* a do nothing	*/  	.getname	   = inet6_getname, -	.poll		   = udp_poll,			/* ok		*/ +	.poll_mask	   = udp_poll_mask,		/* ok		*/  	.ioctl		   = inet6_ioctl,		/* must change  */  	.listen		   = sock_no_listen,		/* ok		*/  	.shutdown	   = inet_shutdown,		/* ok		*/ diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index afc307c89d1a..ce6f0d15b5dd 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1334,7 +1334,7 @@ void 
raw6_proc_exit(void)  }  #endif	/* CONFIG_PROC_FS */ -/* Same as inet6_dgram_ops, sans udp_poll.  */ +/* Same as inet6_dgram_ops, sans udp_poll_mask.  */  const struct proto_ops inet6_sockraw_ops = {  	.family		   = PF_INET6,  	.owner		   = THIS_MODULE, @@ -1344,7 +1344,7 @@ const struct proto_ops inet6_sockraw_ops = {  	.socketpair	   = sock_no_socketpair,	/* a do nothing	*/  	.accept		   = sock_no_accept,		/* a do nothing	*/  	.getname	   = inet6_getname, -	.poll		   = datagram_poll,		/* ok		*/ +	.poll_mask	   = datagram_poll_mask,	/* ok		*/  	.ioctl		   = inet6_ioctl,		/* must change  */  	.listen		   = sock_no_listen,		/* ok		*/  	.shutdown	   = inet_shutdown,		/* ok		*/ diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 893a022f9620..68e86257a549 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1488,14 +1488,11 @@ static inline __poll_t iucv_accept_poll(struct sock *parent)  	return 0;  } -__poll_t iucv_sock_poll(struct file *file, struct socket *sock, -			    poll_table *wait) +static __poll_t iucv_sock_poll_mask(struct socket *sock, __poll_t events)  {  	struct sock *sk = sock->sk;  	__poll_t mask = 0; -	sock_poll_wait(file, sk_sleep(sk), wait); -  	if (sk->sk_state == IUCV_LISTEN)  		return iucv_accept_poll(sk); @@ -2388,7 +2385,7 @@ static const struct proto_ops iucv_sock_ops = {  	.getname	= iucv_sock_getname,  	.sendmsg	= iucv_sock_sendmsg,  	.recvmsg	= iucv_sock_recvmsg, -	.poll		= iucv_sock_poll, +	.poll_mask	= iucv_sock_poll_mask,  	.ioctl		= sock_no_ioctl,  	.mmap		= sock_no_mmap,  	.socketpair	= sock_no_socketpair, diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index d3601d421571..84b7d5c6fec8 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1336,9 +1336,9 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)  	struct list_head *head;  	int index = 0; -	/* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so -	 * we set sk_state, otherwise epoll_wait always returns right away with -	 
* EPOLLHUP +	/* For SOCK_SEQPACKET sock type, datagram_poll_mask checks the sk_state, +	 * so  we set sk_state, otherwise epoll_wait always returns right away +	 * with EPOLLHUP  	 */  	kcm->sk.sk_state = TCP_ESTABLISHED; @@ -1903,7 +1903,7 @@ static const struct proto_ops kcm_dgram_ops = {  	.socketpair =	sock_no_socketpair,  	.accept =	sock_no_accept,  	.getname =	sock_no_getname, -	.poll =		datagram_poll, +	.poll_mask =	datagram_poll_mask,  	.ioctl =	kcm_ioctl,  	.listen =	sock_no_listen,  	.shutdown =	sock_no_shutdown, @@ -1924,7 +1924,7 @@ static const struct proto_ops kcm_seqpacket_ops = {  	.socketpair =	sock_no_socketpair,  	.accept =	sock_no_accept,  	.getname =	sock_no_getname, -	.poll =		datagram_poll, +	.poll_mask =	datagram_poll_mask,  	.ioctl =	kcm_ioctl,  	.listen =	sock_no_listen,  	.shutdown =	sock_no_shutdown, diff --git a/net/key/af_key.c b/net/key/af_key.c index 5e1d2946ffbf..8bdc1cbe490a 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3751,7 +3751,7 @@ static const struct proto_ops pfkey_ops = {  	/* Now the operations that really occur. 
*/  	.release	=	pfkey_release, -	.poll		=	datagram_poll, +	.poll_mask	=	datagram_poll_mask,  	.sendmsg	=	pfkey_sendmsg,  	.recvmsg	=	pfkey_recvmsg,  }; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index a9c05b2bc1b0..181073bf6925 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -613,7 +613,7 @@ static const struct proto_ops l2tp_ip_ops = {  	.socketpair	   = sock_no_socketpair,  	.accept		   = sock_no_accept,  	.getname	   = l2tp_ip_getname, -	.poll		   = datagram_poll, +	.poll_mask	   = datagram_poll_mask,  	.ioctl		   = inet_ioctl,  	.listen		   = sock_no_listen,  	.shutdown	   = inet_shutdown, diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 957369192ca1..336e4c00abbc 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -754,7 +754,7 @@ static const struct proto_ops l2tp_ip6_ops = {  	.socketpair	   = sock_no_socketpair,  	.accept		   = sock_no_accept,  	.getname	   = l2tp_ip6_getname, -	.poll		   = datagram_poll, +	.poll_mask	   = datagram_poll_mask,  	.ioctl		   = inet6_ioctl,  	.listen		   = sock_no_listen,  	.shutdown	   = inet_shutdown, diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 830469766c1f..3d8ca1231f8f 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1788,7 +1788,7 @@ static const struct proto_ops pppol2tp_ops = {  	.socketpair	= sock_no_socketpair,  	.accept		= sock_no_accept,  	.getname	= pppol2tp_getname, -	.poll		= datagram_poll, +	.poll_mask	= datagram_poll_mask,  	.listen		= sock_no_listen,  	.shutdown	= sock_no_shutdown,  	.setsockopt	= pppol2tp_setsockopt, diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 1beeea9549fa..804de8490186 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -1192,7 +1192,7 @@ static const struct proto_ops llc_ui_ops = {  	.socketpair  = sock_no_socketpair,  	.accept      = llc_ui_accept,  	.getname     = llc_ui_getname, -	.poll	     = datagram_poll, +	.poll_mask   = datagram_poll_mask,  	.ioctl       = llc_ui_ioctl,  	.listen      = 
llc_ui_listen,  	.shutdown    = llc_ui_shutdown, diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 393573a99a5a..1189b84413d5 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2658,7 +2658,7 @@ static const struct proto_ops netlink_ops = {  	.socketpair =	sock_no_socketpair,  	.accept =	sock_no_accept,  	.getname =	netlink_getname, -	.poll =		datagram_poll, +	.poll_mask =	datagram_poll_mask,  	.ioctl =	netlink_ioctl,  	.listen =	sock_no_listen,  	.shutdown =	sock_no_shutdown, diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index c2888c78d4c1..b97eb766a1d5 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1355,7 +1355,7 @@ static const struct proto_ops nr_proto_ops = {  	.socketpair	=	sock_no_socketpair,  	.accept		=	nr_accept,  	.getname	=	nr_getname, -	.poll		=	datagram_poll, +	.poll_mask	=	datagram_poll_mask,  	.ioctl		=	nr_ioctl,  	.listen		=	nr_listen,  	.shutdown	=	sock_no_shutdown, diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index ea0c0c6f1874..ab5bb14b49af 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -548,16 +548,13 @@ static inline __poll_t llcp_accept_poll(struct sock *parent)  	return 0;  } -static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, -				   poll_table *wait) +static __poll_t llcp_sock_poll_mask(struct socket *sock, __poll_t events)  {  	struct sock *sk = sock->sk;  	__poll_t mask = 0;  	pr_debug("%p\n", sk); -	sock_poll_wait(file, sk_sleep(sk), wait); -  	if (sk->sk_state == LLCP_LISTEN)  		return llcp_accept_poll(sk); @@ -899,7 +896,7 @@ static const struct proto_ops llcp_sock_ops = {  	.socketpair     = sock_no_socketpair,  	.accept         = llcp_sock_accept,  	.getname        = llcp_sock_getname, -	.poll           = llcp_sock_poll, +	.poll_mask      = llcp_sock_poll_mask,  	.ioctl          = sock_no_ioctl,  	.listen         = llcp_sock_listen,  	.shutdown       = sock_no_shutdown, @@ -919,7 +916,7 @@ static const struct 
proto_ops llcp_rawsock_ops = {  	.socketpair     = sock_no_socketpair,  	.accept         = sock_no_accept,  	.getname        = llcp_sock_getname, -	.poll           = llcp_sock_poll, +	.poll_mask      = llcp_sock_poll_mask,  	.ioctl          = sock_no_ioctl,  	.listen         = sock_no_listen,  	.shutdown       = sock_no_shutdown, diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index e2188deb08dc..60c322531c49 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -284,7 +284,7 @@ static const struct proto_ops rawsock_ops = {  	.socketpair     = sock_no_socketpair,  	.accept         = sock_no_accept,  	.getname        = sock_no_getname, -	.poll           = datagram_poll, +	.poll_mask      = datagram_poll_mask,  	.ioctl          = sock_no_ioctl,  	.listen         = sock_no_listen,  	.shutdown       = sock_no_shutdown, @@ -304,7 +304,7 @@ static const struct proto_ops rawsock_raw_ops = {  	.socketpair     = sock_no_socketpair,  	.accept         = sock_no_accept,  	.getname        = sock_no_getname, -	.poll           = datagram_poll, +	.poll_mask      = datagram_poll_mask,  	.ioctl          = sock_no_ioctl,  	.listen         = sock_no_listen,  	.shutdown       = sock_no_shutdown, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index f9cdd27a7f6f..674390b1f084 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4110,12 +4110,11 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,  	return 0;  } -static __poll_t packet_poll(struct file *file, struct socket *sock, -				poll_table *wait) +static __poll_t packet_poll_mask(struct socket *sock, __poll_t events)  {  	struct sock *sk = sock->sk;  	struct packet_sock *po = pkt_sk(sk); -	__poll_t mask = datagram_poll(file, sock, wait); +	__poll_t mask = datagram_poll_mask(sock, events);  	spin_lock_bh(&sk->sk_receive_queue.lock);  	if (po->rx_ring.pg_vec) { @@ -4457,7 +4456,7 @@ static const struct proto_ops packet_ops_spkt = {  	.socketpair =	sock_no_socketpair,  	.accept =	
sock_no_accept,  	.getname =	packet_getname_spkt, -	.poll =		datagram_poll, +	.poll_mask =	datagram_poll_mask,  	.ioctl =	packet_ioctl,  	.listen =	sock_no_listen,  	.shutdown =	sock_no_shutdown, @@ -4478,7 +4477,7 @@ static const struct proto_ops packet_ops = {  	.socketpair =	sock_no_socketpair,  	.accept =	sock_no_accept,  	.getname =	packet_getname, -	.poll =		packet_poll, +	.poll_mask =	packet_poll_mask,  	.ioctl =	packet_ioctl,  	.listen =	sock_no_listen,  	.shutdown =	sock_no_shutdown, diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 30187990257f..c295c4e20f01 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -340,15 +340,12 @@ static int pn_socket_getname(struct socket *sock, struct sockaddr *addr,  	return sizeof(struct sockaddr_pn);  } -static __poll_t pn_socket_poll(struct file *file, struct socket *sock, -					poll_table *wait) +static __poll_t pn_socket_poll_mask(struct socket *sock, __poll_t events)  {  	struct sock *sk = sock->sk;  	struct pep_sock *pn = pep_sk(sk);  	__poll_t mask = 0; -	poll_wait(file, sk_sleep(sk), wait); -  	if (sk->sk_state == TCP_CLOSE)  		return EPOLLERR;  	if (!skb_queue_empty(&sk->sk_receive_queue)) @@ -448,7 +445,7 @@ const struct proto_ops phonet_dgram_ops = {  	.socketpair	= sock_no_socketpair,  	.accept		= sock_no_accept,  	.getname	= pn_socket_getname, -	.poll		= datagram_poll, +	.poll_mask	= datagram_poll_mask,  	.ioctl		= pn_socket_ioctl,  	.listen		= sock_no_listen,  	.shutdown	= sock_no_shutdown, @@ -473,7 +470,7 @@ const struct proto_ops phonet_stream_ops = {  	.socketpair	= sock_no_socketpair,  	.accept		= pn_socket_accept,  	.getname	= pn_socket_getname, -	.poll		= pn_socket_poll, +	.poll_mask	= pn_socket_poll_mask,  	.ioctl		= pn_socket_ioctl,  	.listen		= pn_socket_listen,  	.shutdown	= sock_no_shutdown, diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 2aa07b547b16..1b5025ea5b04 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -1023,7 +1023,7 @@ static const struct proto_ops 
qrtr_proto_ops = {  	.recvmsg	= qrtr_recvmsg,  	.getname	= qrtr_getname,  	.ioctl		= qrtr_ioctl, -	.poll		= datagram_poll, +	.poll_mask	= datagram_poll_mask,  	.shutdown	= sock_no_shutdown,  	.setsockopt	= sock_no_setsockopt,  	.getsockopt	= sock_no_getsockopt, diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 22a7f2b413ac..5b73fea849df 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1470,7 +1470,7 @@ static const struct proto_ops rose_proto_ops = {  	.socketpair	=	sock_no_socketpair,  	.accept		=	rose_accept,  	.getname	=	rose_getname, -	.poll		=	datagram_poll, +	.poll_mask	=	datagram_poll_mask,  	.ioctl		=	rose_ioctl,  	.listen		=	rose_listen,  	.shutdown	=	sock_no_shutdown, diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 2b463047dd7b..3b1ac93efee2 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -734,15 +734,11 @@ static int rxrpc_getsockopt(struct socket *sock, int level, int optname,  /*   * permit an RxRPC socket to be polled   */ -static __poll_t rxrpc_poll(struct file *file, struct socket *sock, -			       poll_table *wait) +static __poll_t rxrpc_poll_mask(struct socket *sock, __poll_t events)  {  	struct sock *sk = sock->sk;  	struct rxrpc_sock *rx = rxrpc_sk(sk); -	__poll_t mask; - -	sock_poll_wait(file, sk_sleep(sk), wait); -	mask = 0; +	__poll_t mask = 0;  	/* the socket is readable if there are any messages waiting on the Rx  	 * queue */ @@ -949,7 +945,7 @@ static const struct proto_ops rxrpc_rpc_ops = {  	.socketpair	= sock_no_socketpair,  	.accept		= sock_no_accept,  	.getname	= sock_no_getname, -	.poll		= rxrpc_poll, +	.poll_mask	= rxrpc_poll_mask,  	.ioctl		= sock_no_ioctl,  	.listen		= rxrpc_listen,  	.shutdown	= rxrpc_shutdown, diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 0cd2e764f47f..7339918a805d 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -1010,7 +1010,7 @@ static const struct proto_ops inet6_seqpacket_ops = {  	.socketpair	   = sock_no_socketpair,  	.accept		   = 
inet_accept,  	.getname	   = sctp_getname, -	.poll		   = sctp_poll, +	.poll_mask	   = sctp_poll_mask,  	.ioctl		   = inet6_ioctl,  	.listen		   = sctp_inet_listen,  	.shutdown	   = inet_shutdown, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 6bf0a9971888..11d93377ba5e 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1016,7 +1016,7 @@ static const struct proto_ops inet_seqpacket_ops = {  	.socketpair	   = sock_no_socketpair,  	.accept		   = inet_accept,  	.getname	   = inet_getname,	/* Semantics are different.  */ -	.poll		   = sctp_poll, +	.poll_mask	   = sctp_poll_mask,  	.ioctl		   = inet_ioctl,  	.listen		   = sctp_inet_listen,  	.shutdown	   = inet_shutdown,	/* Looks harmless.  */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ae7e7c606f72..bf747094d26b 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7722,14 +7722,12 @@ out:   * here, again, by modeling the current TCP/UDP code.  We don't have   * a good way to test with it yet.   */ -__poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t sctp_poll_mask(struct socket *sock, __poll_t events)  {  	struct sock *sk = sock->sk;  	struct sctp_sock *sp = sctp_sk(sk);  	__poll_t mask; -	poll_wait(file, sk_sleep(sk), wait); -  	sock_rps_record_flow(sk);  	/* A TCP-style listening socket becomes readable when the accept queue diff --git a/net/socket.c b/net/socket.c index f10f1d947c78..2d752e9eb3f9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -117,8 +117,10 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from);  static int sock_mmap(struct file *file, struct vm_area_struct *vma);  static int sock_close(struct inode *inode, struct file *file); -static __poll_t sock_poll(struct file *file, -			      struct poll_table_struct *wait); +static struct wait_queue_head *sock_get_poll_head(struct file *file, +		__poll_t events); +static __poll_t sock_poll_mask(struct file *file, __poll_t); +static __poll_t sock_poll(struct 
file *file, struct poll_table_struct *wait);  static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);  #ifdef CONFIG_COMPAT  static long compat_sock_ioctl(struct file *file, @@ -141,6 +143,8 @@ static const struct file_operations socket_file_ops = {  	.llseek =	no_llseek,  	.read_iter =	sock_read_iter,  	.write_iter =	sock_write_iter, +	.get_poll_head = sock_get_poll_head, +	.poll_mask =	sock_poll_mask,  	.poll =		sock_poll,  	.unlocked_ioctl = sock_ioctl,  #ifdef CONFIG_COMPAT @@ -1114,27 +1118,48 @@ out_release:  }  EXPORT_SYMBOL(sock_create_lite); -/* No kernel lock held - perfect */ -static __poll_t sock_poll(struct file *file, poll_table *wait) +static struct wait_queue_head *sock_get_poll_head(struct file *file, +		__poll_t events)  { -	__poll_t busy_flag = 0; -	struct socket *sock; +	struct socket *sock = file->private_data; + +	if (!sock->ops->poll_mask) +		return NULL; +	sock_poll_busy_loop(sock, events); +	return sk_sleep(sock->sk); +} + +static __poll_t sock_poll_mask(struct file *file, __poll_t events) +{ +	struct socket *sock = file->private_data;  	/* -	 *      We can't return errors to poll, so it's either yes or no. +	 * We need to be sure we are in sync with the socket flags modification. +	 * +	 * This memory barrier is paired in the wq_has_sleeper.  	 */ -	sock = file->private_data; +	smp_mb(); + +	/* this socket can poll_ll so tell the system call */ +	return sock->ops->poll_mask(sock, events) | +		(sk_can_busy_loop(sock->sk) ? 
POLL_BUSY_LOOP : 0); +} -	if (sk_can_busy_loop(sock->sk)) { -		/* this socket can poll_ll so tell the system call */ -		busy_flag = POLL_BUSY_LOOP; +/* No kernel lock held - perfect */ +static __poll_t sock_poll(struct file *file, poll_table *wait) +{ +	struct socket *sock = file->private_data; +	__poll_t events = poll_requested_events(wait), mask = 0; -		/* once, only if requested by syscall */ -		if (wait && (wait->_key & POLL_BUSY_LOOP)) -			sk_busy_loop(sock->sk, 1); +	if (sock->ops->poll) { +		sock_poll_busy_loop(sock, events); +		mask = sock->ops->poll(file, sock, wait); +	} else if (sock->ops->poll_mask) { +		sock_poll_wait(file, sock_get_poll_head(file, events), wait); +		mask = sock->ops->poll_mask(sock, events);  	} -	return busy_flag | sock->ops->poll(file, sock, wait); +	return mask | sock_poll_busy_flag(sock);  }  static int sock_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 6be21575503a..3bb45042e833 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -692,10 +692,9 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,  }  /** - * tipc_poll - read and possibly block on pollmask + * tipc_poll - read pollmask   * @file: file structure associated with the socket   * @sock: socket for which to calculate the poll bits - * @wait: ???   *   * Returns pollmask value   * @@ -709,15 +708,12 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,   * imply that the operation will succeed, merely that it should be performed   * and will not block.   
*/ -static __poll_t tipc_poll(struct file *file, struct socket *sock, -			      poll_table *wait) +static __poll_t tipc_poll_mask(struct socket *sock, __poll_t events)  {  	struct sock *sk = sock->sk;  	struct tipc_sock *tsk = tipc_sk(sk);  	__poll_t revents = 0; -	sock_poll_wait(file, sk_sleep(sk), wait); -  	if (sk->sk_shutdown & RCV_SHUTDOWN)  		revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;  	if (sk->sk_shutdown == SHUTDOWN_MASK) @@ -3028,7 +3024,7 @@ static const struct proto_ops msg_ops = {  	.socketpair	= tipc_socketpair,  	.accept		= sock_no_accept,  	.getname	= tipc_getname, -	.poll		= tipc_poll, +	.poll_mask	= tipc_poll_mask,  	.ioctl		= tipc_ioctl,  	.listen		= sock_no_listen,  	.shutdown	= tipc_shutdown, @@ -3049,7 +3045,7 @@ static const struct proto_ops packet_ops = {  	.socketpair	= tipc_socketpair,  	.accept		= tipc_accept,  	.getname	= tipc_getname, -	.poll		= tipc_poll, +	.poll_mask	= tipc_poll_mask,  	.ioctl		= tipc_ioctl,  	.listen		= tipc_listen,  	.shutdown	= tipc_shutdown, @@ -3070,7 +3066,7 @@ static const struct proto_ops stream_ops = {  	.socketpair	= tipc_socketpair,  	.accept		= tipc_accept,  	.getname	= tipc_getname, -	.poll		= tipc_poll, +	.poll_mask	= tipc_poll_mask,  	.ioctl		= tipc_ioctl,  	.listen		= tipc_listen,  	.shutdown	= tipc_shutdown, diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e5473c03d667..95b02a71fd47 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -638,9 +638,8 @@ static int unix_stream_connect(struct socket *, struct sockaddr *,  static int unix_socketpair(struct socket *, struct socket *);  static int unix_accept(struct socket *, struct socket *, int, bool);  static int unix_getname(struct socket *, struct sockaddr *, int); -static __poll_t unix_poll(struct file *, struct socket *, poll_table *); -static __poll_t unix_dgram_poll(struct file *, struct socket *, -				    poll_table *); +static __poll_t unix_poll_mask(struct socket *, __poll_t); +static __poll_t unix_dgram_poll_mask(struct 
socket *, __poll_t);  static int unix_ioctl(struct socket *, unsigned int, unsigned long);  static int unix_shutdown(struct socket *, int);  static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); @@ -681,7 +680,7 @@ static const struct proto_ops unix_stream_ops = {  	.socketpair =	unix_socketpair,  	.accept =	unix_accept,  	.getname =	unix_getname, -	.poll =		unix_poll, +	.poll_mask =	unix_poll_mask,  	.ioctl =	unix_ioctl,  	.listen =	unix_listen,  	.shutdown =	unix_shutdown, @@ -704,7 +703,7 @@ static const struct proto_ops unix_dgram_ops = {  	.socketpair =	unix_socketpair,  	.accept =	sock_no_accept,  	.getname =	unix_getname, -	.poll =		unix_dgram_poll, +	.poll_mask =	unix_dgram_poll_mask,  	.ioctl =	unix_ioctl,  	.listen =	sock_no_listen,  	.shutdown =	unix_shutdown, @@ -726,7 +725,7 @@ static const struct proto_ops unix_seqpacket_ops = {  	.socketpair =	unix_socketpair,  	.accept =	unix_accept,  	.getname =	unix_getname, -	.poll =		unix_dgram_poll, +	.poll_mask =	unix_dgram_poll_mask,  	.ioctl =	unix_ioctl,  	.listen =	unix_listen,  	.shutdown =	unix_shutdown, @@ -2630,13 +2629,10 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)  	return err;  } -static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) +static __poll_t unix_poll_mask(struct socket *sock, __poll_t events)  {  	struct sock *sk = sock->sk; -	__poll_t mask; - -	sock_poll_wait(file, sk_sleep(sk), wait); -	mask = 0; +	__poll_t mask = 0;  	/* exceptional events? 
*/  	if (sk->sk_err) @@ -2665,15 +2661,11 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa  	return mask;  } -static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, -				    poll_table *wait) +static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events)  {  	struct sock *sk = sock->sk, *other; -	unsigned int writable; -	__poll_t mask; - -	sock_poll_wait(file, sk_sleep(sk), wait); -	mask = 0; +	int writable; +	__poll_t mask = 0;  	/* exceptional events? */  	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) @@ -2699,7 +2691,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,  	}  	/* No write status requested, avoid expensive OUT tests. */ -	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) +	if (!(events & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))  		return mask;  	writable = unix_writable(sk); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index c1076c19b858..bb5d5fa68c35 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -850,18 +850,11 @@ static int vsock_shutdown(struct socket *sock, int mode)  	return err;  } -static __poll_t vsock_poll(struct file *file, struct socket *sock, -			       poll_table *wait) +static __poll_t vsock_poll_mask(struct socket *sock, __poll_t events)  { -	struct sock *sk; -	__poll_t mask; -	struct vsock_sock *vsk; - -	sk = sock->sk; -	vsk = vsock_sk(sk); - -	poll_wait(file, sk_sleep(sk), wait); -	mask = 0; +	struct sock *sk = sock->sk; +	struct vsock_sock *vsk = vsock_sk(sk); +	__poll_t mask = 0;  	if (sk->sk_err)  		/* Signify that there has been an error on this socket. 
*/ @@ -1091,7 +1084,7 @@ static const struct proto_ops vsock_dgram_ops = {  	.socketpair = sock_no_socketpair,  	.accept = sock_no_accept,  	.getname = vsock_getname, -	.poll = vsock_poll, +	.poll_mask = vsock_poll_mask,  	.ioctl = sock_no_ioctl,  	.listen = sock_no_listen,  	.shutdown = vsock_shutdown, @@ -1849,7 +1842,7 @@ static const struct proto_ops vsock_stream_ops = {  	.socketpair = sock_no_socketpair,  	.accept = vsock_accept,  	.getname = vsock_getname, -	.poll = vsock_poll, +	.poll_mask = vsock_poll_mask,  	.ioctl = sock_no_ioctl,  	.listen = vsock_listen,  	.shutdown = vsock_shutdown, diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index d49aa79b7997..f93365ae0fdd 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1750,7 +1750,7 @@ static const struct proto_ops x25_proto_ops = {  	.socketpair =	sock_no_socketpair,  	.accept =	x25_accept,  	.getname =	x25_getname, -	.poll =		datagram_poll, +	.poll_mask =	datagram_poll_mask,  	.ioctl =	x25_ioctl,  #ifdef CONFIG_COMPAT  	.compat_ioctl = compat_x25_ioctl, diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 6e865e8b5b10..90d30fbe95ae 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -397,7 +397,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)  	 * Check if there was an event already pending on the eventfd  	 * before we registered, and trigger it as if we didn't miss it.  	 */ -	events = f.file->f_op->poll(f.file, &irqfd->pt); +	events = vfs_poll(f.file, &irqfd->pt);  	if (events & EPOLLIN)  		schedule_work(&irqfd->inject); | 
