diff options
Diffstat (limited to 'fs/io_uring.c')
| -rw-r--r-- | fs/io_uring.c | 4183 | 
1 files changed, 2686 insertions, 1497 deletions
| diff --git a/fs/io_uring.c b/fs/io_uring.c index a8413f006417..9f1c682d7caf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -63,7 +63,6 @@  #include <net/sock.h>  #include <net/af_unix.h>  #include <net/scm.h> -#include <net/busy_poll.h>  #include <linux/anon_inodes.h>  #include <linux/sched/mm.h>  #include <linux/uaccess.h> @@ -81,6 +80,7 @@  #include <linux/io_uring.h>  #include <linux/audit.h>  #include <linux/security.h> +#include <linux/xattr.h>  #define CREATE_TRACE_POINTS  #include <trace/events/io_uring.h> @@ -95,7 +95,7 @@  #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8  /* only define max */ -#define IORING_MAX_FIXED_FILES	(1U << 15) +#define IORING_MAX_FIXED_FILES	(1U << 20)  #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \  				 IORING_REGISTER_LAST + IORING_OP_LAST) @@ -112,8 +112,12 @@  			IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)  #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ -				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ -				REQ_F_ASYNC_DATA) +				REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA) + +#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ +				 IO_REQ_CLEAN_FLAGS) + +#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)  #define IO_TCTX_REFS_CACHE_NR	(1U << 10) @@ -168,7 +172,7 @@ struct io_rings {  	 * The application needs a full memory barrier before checking  	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.  	 */ -	u32			sq_flags; +	atomic_t		sq_flags;  	/*  	 * Runtime CQ flags  	 * @@ -200,13 +204,6 @@ struct io_rings {  	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;  }; -enum io_uring_cmd_flags { -	IO_URING_F_COMPLETE_DEFER	= 1, -	IO_URING_F_UNLOCKED		= 2, -	/* int's last bit, sign checks are usually faster than a bit test */ -	IO_URING_F_NONBLOCK		= INT_MIN, -}; -  struct io_mapped_ubuf {  	u64		ubuf;  	u64		ubuf_end; @@ -218,10 +215,27 @@ struct io_mapped_ubuf {  struct io_ring_ctx;  struct io_overflow_cqe { -	struct io_uring_cqe cqe;  	struct list_head list; +	struct io_uring_cqe cqe;  }; +/* + * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 + * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we + * can't safely always dereference the file when the task has exited and ring + * cleanup is done. If a file is tracked and part of SCM, then unix gc on + * process exit may reap it before __io_sqe_files_unregister() is run. + */ +#define FFS_NOWAIT		0x1UL +#define FFS_ISREG		0x2UL +#if defined(CONFIG_64BIT) +#define FFS_SCM			0x4UL +#else +#define IO_URING_SCM_ALL +#define FFS_SCM			0x0UL +#endif +#define FFS_MASK		~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) +  struct io_fixed_file {  	/* file * with additional FFS_* flags */  	unsigned long file_ptr; @@ -239,6 +253,8 @@ struct io_rsrc_put {  struct io_file_table {  	struct io_fixed_file *files; +	unsigned long *bitmap; +	unsigned int alloc_hint;  };  struct io_rsrc_node { @@ -263,10 +279,26 @@ struct io_rsrc_data {  	bool				quiesce;  }; +#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))  struct io_buffer_list { -	struct list_head list; -	struct list_head buf_list; +	/* +	 * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, +	 * then these are classic provided buffers and ->buf_list is used. +	 */ +	union { +		struct list_head buf_list; +		struct { +			struct page **buf_pages; +			struct io_uring_buf_ring *buf_ring; +		}; +	};  	__u16 bgid; + +	/* below is for ring provided buffers */ +	__u16 buf_nr_pages; +	__u16 nr_entries; +	__u32 head; +	__u32 mask;  };  struct io_buffer { @@ -339,7 +371,7 @@ struct io_ev_fd {  	struct rcu_head		rcu;  }; -#define IO_BUFFERS_HASH_BITS	5 +#define BGID_ARRAY	64  struct io_ring_ctx {  	/* const or read-mostly hot data */ @@ -348,6 +380,7 @@ struct io_ring_ctx {  		struct io_rings		*rings;  		unsigned int		flags; +		enum task_work_notify_mode	notify_method;  		unsigned int		compat: 1;  		unsigned int		drain_next: 1;  		unsigned int		restricted: 1; @@ -355,6 +388,7 @@ struct io_ring_ctx {  		unsigned int		drain_active: 1;  		unsigned int		drain_disabled: 1;  		unsigned int		has_evfd: 1; +		unsigned int		syscall_iopoll: 1;  	} ____cacheline_aligned_in_smp;  	/* submission data */ @@ -384,17 +418,21 @@ struct io_ring_ctx {  		 */  		struct io_rsrc_node	*rsrc_node;  		int			rsrc_cached_refs; +		atomic_t		cancel_seq;  		struct io_file_table	file_table;  		unsigned		nr_user_files;  		unsigned		nr_user_bufs;  		struct io_mapped_ubuf	**user_bufs;  		struct io_submit_state	submit_state; + +		struct io_buffer_list	*io_bl; +		struct xarray		io_bl_xa; +		struct list_head	io_buffers_cache; +  		struct list_head	timeout_list;  		struct list_head	ltimeout_list;  		struct list_head	cq_overflow_list; -		struct list_head	*io_buffers; -		struct list_head	io_buffers_cache;  		struct list_head	apoll_cache;  		struct xarray		personalities;  		u32			pers_next; @@ -411,14 +449,16 @@ struct io_ring_ctx {  	struct wait_queue_head	sqo_sq_wait;  	struct list_head	sqd_list; -	unsigned long		check_cq_overflow; -#ifdef CONFIG_NET_RX_BUSY_POLL -	/* used to track busy poll napi_id */ -	struct list_head	napi_list; -	spinlock_t		napi_lock;	/* napi_list lock */ -#endif +	unsigned long		check_cq;  	struct { +		/* +		 * We cache a range of free CQEs we can use, once exhausted it +		 * should go through a slower range setup, see __io_get_cqe() +		 */ +		struct io_uring_cqe	*cqe_cached; +		struct io_uring_cqe	*cqe_sentinel; +  		unsigned		cached_cq_tail;  		unsigned		cq_entries;  		struct io_ev_fd	__rcu	*io_ev_fd; @@ -500,12 +540,11 @@ struct io_uring_task {  	const struct io_ring_ctx *last;  	struct io_wq		*io_wq;  	struct percpu_counter	inflight; -	atomic_t		inflight_tracked;  	atomic_t		in_idle;  	spinlock_t		task_lock;  	struct io_wq_work_list	task_list; -	struct io_wq_work_list	prior_task_list; +	struct io_wq_work_list	prio_task_list;  	struct callback_head	task_work;  	struct file		**registered_rings;  	bool			task_running; @@ -554,6 +593,16 @@ struct io_accept {  	unsigned long			nofile;  }; +struct io_socket { +	struct file			*file; +	int				domain; +	int				type; +	int				protocol; +	int				flags; +	u32				file_slot; +	unsigned long			nofile; +}; +  struct io_sync {  	struct file			*file;  	loff_t				len; @@ -565,6 +614,8 @@ struct io_sync {  struct io_cancel {  	struct file			*file;  	u64				addr; +	u32				flags; +	s32				fd;  };  struct io_timeout { @@ -592,7 +643,8 @@ struct io_rw {  	/* NOTE: kiocb has the file as the first member, so don't do it here */  	struct kiocb			kiocb;  	u64				addr; -	u64				len; +	u32				len; +	rwf_t				flags;  };  struct io_connect { @@ -609,9 +661,9 @@ struct io_sr_msg {  		void __user			*buf;  	};  	int				msg_flags; -	int				bgid;  	size_t				len;  	size_t				done_io; +	unsigned int			flags;  };  struct io_open { @@ -654,10 +706,10 @@ struct io_epoll {  struct io_splice {  	struct file			*file_out; -	struct file			*file_in;  	loff_t				off_out;  	loff_t				off_in;  	u64				len; +	int				splice_fd_in;  	unsigned int			flags;  }; @@ -729,6 +781,12 @@ struct io_msg {  	u32 len;  }; +struct io_nop { +	struct file			*file; +	u64				extra1; +	u64				extra2; +}; +  struct io_async_connect {  	struct sockaddr_storage		address;  }; @@ -755,6 +813,12 @@ struct io_async_rw {  	struct wait_page_queue		wpq;  }; +struct io_xattr { +	struct file			*file; +	struct xattr_ctx		ctx; +	struct filename			*filename; +}; +  enum {  	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,  	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT, @@ -773,6 +837,7 @@ enum {  	REQ_F_NEED_CLEANUP_BIT,  	REQ_F_POLLED_BIT,  	REQ_F_BUFFER_SELECTED_BIT, +	REQ_F_BUFFER_RING_BIT,  	REQ_F_COMPLETE_INLINE_BIT,  	REQ_F_REISSUE_BIT,  	REQ_F_CREDS_BIT, @@ -783,6 +848,7 @@ enum {  	REQ_F_SINGLE_POLL_BIT,  	REQ_F_DOUBLE_POLL_BIT,  	REQ_F_PARTIAL_IO_BIT, +	REQ_F_APOLL_MULTISHOT_BIT,  	/* keep async read/write and isreg together and in order */  	REQ_F_SUPPORT_NOWAIT_BIT,  	REQ_F_ISREG_BIT, @@ -823,6 +889,8 @@ enum {  	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),  	/* buffer already selected */  	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT), +	/* buffer selected from ring, needs commit */ +	REQ_F_BUFFER_RING	= BIT(REQ_F_BUFFER_RING_BIT),  	/* completion is deferred through io_comp_state */  	REQ_F_COMPLETE_INLINE	= BIT(REQ_F_COMPLETE_INLINE_BIT),  	/* caller should reissue async */ @@ -847,6 +915,8 @@ enum {  	REQ_F_DOUBLE_POLL	= BIT(REQ_F_DOUBLE_POLL_BIT),  	/* request has already done partial IO */  	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT), +	/* fast poll multishot mode */ +	REQ_F_APOLL_MULTISHOT	= BIT(REQ_F_APOLL_MULTISHOT_BIT),  };  struct async_poll { @@ -869,6 +939,21 @@ enum {  	IORING_RSRC_BUFFER		= 1,  }; +struct io_cqe { +	__u64	user_data; +	__s32	res; +	/* fd initially, then cflags for completion */ +	union { +		__u32	flags; +		int	fd; +	}; +}; + +enum { +	IO_CHECK_CQ_OVERFLOW_BIT, +	IO_CHECK_CQ_DROPPED_BIT, +}; +  /*   * NOTE! Each of the iocb union members has the file pointer   * as the first entry in their struct definition. So you can @@ -904,38 +989,65 @@ struct io_kiocb {  		struct io_symlink	symlink;  		struct io_hardlink	hardlink;  		struct io_msg		msg; +		struct io_xattr		xattr; +		struct io_socket	sock; +		struct io_nop		nop; +		struct io_uring_cmd	uring_cmd;  	};  	u8				opcode;  	/* polled IO has completed */  	u8				iopoll_completed; +	/* +	 * Can be either a fixed buffer index, or used with provided buffers. +	 * For the latter, before issue it points to the buffer group ID, +	 * and after selection it points to the buffer ID itself. +	 */  	u16				buf_index;  	unsigned int			flags; -	u64				user_data; -	u32				result; -	u32				cflags; +	struct io_cqe			cqe;  	struct io_ring_ctx		*ctx;  	struct task_struct		*task; -	struct percpu_ref		*fixed_rsrc_refs; -	/* store used ubuf, so we can prevent reloading */ -	struct io_mapped_ubuf		*imu; +	struct io_rsrc_node		*rsrc_node; + +	union { +		/* store used ubuf, so we can prevent reloading */ +		struct io_mapped_ubuf	*imu; + +		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ +		struct io_buffer	*kbuf; -	/* used by request caches, completion batching and iopoll */ -	struct io_wq_work_node		comp_list; +		/* +		 * stores buffer ID for ring provided buffers, valid IFF +		 * REQ_F_BUFFER_RING is set. +		 */ +		struct io_buffer_list	*buf_list; +	}; + +	union { +		/* used by request caches, completion batching and iopoll */ +		struct io_wq_work_node	comp_list; +		/* cache ->apoll->events */ +		__poll_t apoll_events; +	};  	atomic_t			refs;  	atomic_t			poll_refs;  	struct io_task_work		io_task_work;  	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ -	struct hlist_node		hash_node; +	union { +		struct hlist_node	hash_node; +		struct { +			u64		extra1; +			u64		extra2; +		}; +	};  	/* internal polling, see IORING_FEAT_FAST_POLL */  	struct async_poll		*apoll;  	/* opcode allocated if it needs to store data for async defer */  	void				*async_data; -	/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ -	struct io_buffer		*kbuf;  	/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */  	struct io_kiocb			*link;  	/* custom credentials, valid IFF REQ_F_CREDS is set */ @@ -955,6 +1067,24 @@ struct io_defer_entry {  	u32			seq;  }; +struct io_cancel_data { +	struct io_ring_ctx *ctx; +	union { +		u64 data; +		struct file *file; +	}; +	u32 flags; +	int seq; +}; + +/* + * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into + * the following sqe if SQE128 is used. + */ +#define uring_cmd_pdu_size(is_sqe128)				\ +	((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) -	\ +		offsetof(struct io_uring_sqe, cmd)) +  struct io_op_def {  	/* needs req->file assigned */  	unsigned		needs_file : 1; @@ -976,12 +1106,20 @@ struct io_op_def {  	unsigned		not_supported : 1;  	/* skip auditing */  	unsigned		audit_skip : 1; +	/* supports ioprio */ +	unsigned		ioprio : 1; +	/* supports iopoll */ +	unsigned		iopoll : 1;  	/* size of async data needed, if any */  	unsigned short		async_size;  };  static const struct io_op_def io_op_defs[] = { -	[IORING_OP_NOP] = {}, +	[IORING_OP_NOP] = { +		.audit_skip		= 1, +		.iopoll			= 1, +		.buffer_select		= 1, +	},  	[IORING_OP_READV] = {  		.needs_file		= 1,  		.unbound_nonreg_file	= 1, @@ -990,6 +1128,8 @@ static const struct io_op_def io_op_defs[] = {  		.needs_async_setup	= 1,  		.plug			= 1,  		.audit_skip		= 1, +		.ioprio			= 1, +		.iopoll			= 1,  		.async_size		= sizeof(struct io_async_rw),  	},  	[IORING_OP_WRITEV] = { @@ -1000,6 +1140,8 @@ static const struct io_op_def io_op_defs[] = {  		.needs_async_setup	= 1,  		.plug			= 1,  		.audit_skip		= 1, +		.ioprio			= 1, +		.iopoll			= 1,  		.async_size		= sizeof(struct io_async_rw),  	},  	[IORING_OP_FSYNC] = { @@ -1012,6 +1154,8 @@ static const struct io_op_def io_op_defs[] = {  		.pollin			= 1,  		.plug			= 1,  		.audit_skip		= 1, +		.ioprio			= 1, +		.iopoll			= 1,  		.async_size		= sizeof(struct io_async_rw),  	},  	[IORING_OP_WRITE_FIXED] = { @@ -1021,6 +1165,8 @@ static const struct io_op_def io_op_defs[] = {  		.pollout		= 1,  		.plug			= 1,  		.audit_skip		= 1, +		.ioprio			= 1, +		.iopoll			= 1,  		.async_size		= sizeof(struct io_async_rw),  	},  	[IORING_OP_POLL_ADD] = { @@ -1063,6 +1209,7 @@ static const struct io_op_def io_op_defs[] = {  		.unbound_nonreg_file	= 1,  		.pollin			= 1,  		.poll_exclusive		= 1, +		.ioprio			= 1,	/* used for flags */  	},  	[IORING_OP_ASYNC_CANCEL] = {  		.audit_skip		= 1, @@ -1085,6 +1232,7 @@ static const struct io_op_def io_op_defs[] = {  	[IORING_OP_CLOSE] = {},  	[IORING_OP_FILES_UPDATE] = {  		.audit_skip		= 1, +		.iopoll			= 1,  	},  	[IORING_OP_STATX] = {  		.audit_skip		= 1, @@ -1096,6 +1244,8 @@ static const struct io_op_def io_op_defs[] = {  		.buffer_select		= 1,  		.plug			= 1,  		.audit_skip		= 1, +		.ioprio			= 1, +		.iopoll			= 1,  		.async_size		= sizeof(struct io_async_rw),  	},  	[IORING_OP_WRITE] = { @@ -1105,6 +1255,8 @@ static const struct io_op_def io_op_defs[] = {  		.pollout		= 1,  		.plug			= 1,  		.audit_skip		= 1, +		.ioprio			= 1, +		.iopoll			= 1,  		.async_size		= sizeof(struct io_async_rw),  	},  	[IORING_OP_FADVISE] = { @@ -1139,9 +1291,11 @@ static const struct io_op_def io_op_defs[] = {  	},  	[IORING_OP_PROVIDE_BUFFERS] = {  		.audit_skip		= 1, +		.iopoll			= 1,  	},  	[IORING_OP_REMOVE_BUFFERS] = {  		.audit_skip		= 1, +		.iopoll			= 1,  	},  	[IORING_OP_TEE] = {  		.needs_file		= 1, @@ -1159,11 +1313,30 @@ static const struct io_op_def io_op_defs[] = {  	[IORING_OP_LINKAT] = {},  	[IORING_OP_MSG_RING] = {  		.needs_file		= 1, +		.iopoll			= 1, +	}, +	[IORING_OP_FSETXATTR] = { +		.needs_file = 1 +	}, +	[IORING_OP_SETXATTR] = {}, +	[IORING_OP_FGETXATTR] = { +		.needs_file = 1 +	}, +	[IORING_OP_GETXATTR] = {}, +	[IORING_OP_SOCKET] = { +		.audit_skip		= 1, +	}, +	[IORING_OP_URING_CMD] = { +		.needs_file		= 1, +		.plug			= 1, +		.needs_async_setup	= 1, +		.async_size		= uring_cmd_pdu_size(1),  	},  };  /* requests with any of those set should undergo io_disarm_next() */  #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) +#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)  static bool io_disarm_next(struct io_kiocb *req);  static void io_uring_del_tctx_node(unsigned long index); @@ -1172,19 +1345,19 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,  					 bool cancel_all);  static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags); - -static void io_put_req(struct io_kiocb *req); -static void io_put_req_deferred(struct io_kiocb *req); +static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags);  static void io_dismantle_req(struct io_kiocb *req);  static void io_queue_linked_timeout(struct io_kiocb *req);  static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,  				     struct io_uring_rsrc_update2 *up,  				     unsigned nr_args);  static void io_clean_op(struct io_kiocb *req); -static struct file *io_file_get(struct io_ring_ctx *ctx, -				struct io_kiocb *req, int fd, bool fixed); -static void __io_queue_sqe(struct io_kiocb *req); +static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, +					     unsigned issue_flags); +static struct file *io_file_get_normal(struct io_kiocb *req, int fd); +static void io_drop_inflight_file(struct io_kiocb *req); +static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags); +static void io_queue_sqe(struct io_kiocb *req);  static void io_rsrc_put_work(struct work_struct *work);  static void io_req_task_queue(struct io_kiocb *req); @@ -1197,11 +1370,115 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);  static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);  static void io_eventfd_signal(struct io_ring_ctx *ctx); +static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);  static struct kmem_cache *req_cachep;  static const struct file_operations io_uring_fops; +const char *io_uring_get_opcode(u8 opcode) +{ +	switch ((enum io_uring_op)opcode) { +	case IORING_OP_NOP: +		return "NOP"; +	case IORING_OP_READV: +		return "READV"; +	case IORING_OP_WRITEV: +		return "WRITEV"; +	case IORING_OP_FSYNC: +		return "FSYNC"; +	case IORING_OP_READ_FIXED: +		return "READ_FIXED"; +	case IORING_OP_WRITE_FIXED: +		return "WRITE_FIXED"; +	case IORING_OP_POLL_ADD: +		return "POLL_ADD"; +	case IORING_OP_POLL_REMOVE: +		return "POLL_REMOVE"; +	case IORING_OP_SYNC_FILE_RANGE: +		return "SYNC_FILE_RANGE"; +	case IORING_OP_SENDMSG: +		return "SENDMSG"; +	case IORING_OP_RECVMSG: +		return "RECVMSG"; +	case IORING_OP_TIMEOUT: +		return "TIMEOUT"; +	case IORING_OP_TIMEOUT_REMOVE: +		return "TIMEOUT_REMOVE"; +	case IORING_OP_ACCEPT: +		return "ACCEPT"; +	case IORING_OP_ASYNC_CANCEL: +		return "ASYNC_CANCEL"; +	case IORING_OP_LINK_TIMEOUT: +		return "LINK_TIMEOUT"; +	case IORING_OP_CONNECT: +		return "CONNECT"; +	case IORING_OP_FALLOCATE: +		return "FALLOCATE"; +	case IORING_OP_OPENAT: +		return "OPENAT"; +	case IORING_OP_CLOSE: +		return "CLOSE"; +	case IORING_OP_FILES_UPDATE: +		return "FILES_UPDATE"; +	case IORING_OP_STATX: +		return "STATX"; +	case IORING_OP_READ: +		return "READ"; +	case IORING_OP_WRITE: +		return "WRITE"; +	case IORING_OP_FADVISE: +		return "FADVISE"; +	case IORING_OP_MADVISE: +		return "MADVISE"; +	case IORING_OP_SEND: +		return "SEND"; +	case IORING_OP_RECV: +		return "RECV"; +	case IORING_OP_OPENAT2: +		return "OPENAT2"; +	case IORING_OP_EPOLL_CTL: +		return "EPOLL_CTL"; +	case IORING_OP_SPLICE: +		return "SPLICE"; +	case IORING_OP_PROVIDE_BUFFERS: +		return "PROVIDE_BUFFERS"; +	case IORING_OP_REMOVE_BUFFERS: +		return "REMOVE_BUFFERS"; +	case IORING_OP_TEE: +		return "TEE"; +	case IORING_OP_SHUTDOWN: +		return "SHUTDOWN"; +	case IORING_OP_RENAMEAT: +		return "RENAMEAT"; +	case IORING_OP_UNLINKAT: +		return "UNLINKAT"; +	case IORING_OP_MKDIRAT: +		return "MKDIRAT"; +	case IORING_OP_SYMLINKAT: +		return "SYMLINKAT"; +	case IORING_OP_LINKAT: +		return "LINKAT"; +	case IORING_OP_MSG_RING: +		return "MSG_RING"; +	case IORING_OP_FSETXATTR: +		return "FSETXATTR"; +	case IORING_OP_SETXATTR: +		return "SETXATTR"; +	case IORING_OP_FGETXATTR: +		return "FGETXATTR"; +	case IORING_OP_GETXATTR: +		return "GETXATTR"; +	case IORING_OP_SOCKET: +		return "SOCKET"; +	case IORING_OP_URING_CMD: +		return "URING_CMD"; +	case IORING_OP_LAST: +		return "INVALID"; +	} +	return "INVALID"; +} +  struct sock *io_uring_get_socket(struct file *file)  {  #if defined(CONFIG_UNIX) @@ -1215,6 +1492,42 @@ struct sock *io_uring_get_socket(struct file *file)  }  EXPORT_SYMBOL(io_uring_get_socket); +#if defined(CONFIG_UNIX) +static inline bool io_file_need_scm(struct file *filp) +{ +#if defined(IO_URING_SCM_ALL) +	return true; +#else +	return !!unix_get_socket(filp); +#endif +} +#else +static inline bool io_file_need_scm(struct file *filp) +{ +	return false; +} +#endif + +static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) +{ +	lockdep_assert_held(&ctx->uring_lock); +	if (issue_flags & IO_URING_F_UNLOCKED) +		mutex_unlock(&ctx->uring_lock); +} + +static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags) +{ +	/* +	 * "Normal" inline submissions always hold the uring_lock, since we +	 * grab it from the system call. Same is true for the SQPOLL offload. +	 * The only exception is when we've detached the request and issue it +	 * from an async worker thread, grab the lock for that case. +	 */ +	if (issue_flags & IO_URING_F_UNLOCKED) +		mutex_lock(&ctx->uring_lock); +	lockdep_assert_held(&ctx->uring_lock); +} +  static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)  {  	if (!*locked) { @@ -1276,31 +1589,36 @@ static inline void io_req_set_refcount(struct io_kiocb *req)  #define IO_RSRC_REF_BATCH	100 +static void io_rsrc_put_node(struct io_rsrc_node *node, int nr) +{ +	percpu_ref_put_many(&node->refs, nr); +} +  static inline void io_req_put_rsrc_locked(struct io_kiocb *req,  					  struct io_ring_ctx *ctx)  	__must_hold(&ctx->uring_lock)  { -	struct percpu_ref *ref = req->fixed_rsrc_refs; +	struct io_rsrc_node *node = req->rsrc_node; -	if (ref) { -		if (ref == &ctx->rsrc_node->refs) +	if (node) { +		if (node == ctx->rsrc_node)  			ctx->rsrc_cached_refs++;  		else -			percpu_ref_put(ref); +			io_rsrc_put_node(node, 1);  	}  } -static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx) +static inline void io_req_put_rsrc(struct io_kiocb *req)  { -	if (req->fixed_rsrc_refs) -		percpu_ref_put(req->fixed_rsrc_refs); +	if (req->rsrc_node) +		io_rsrc_put_node(req->rsrc_node, 1);  }  static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)  	__must_hold(&ctx->uring_lock)  {  	if (ctx->rsrc_cached_refs) { -		percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs); +		io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);  		ctx->rsrc_cached_refs = 0;  	}  } @@ -1313,33 +1631,42 @@ static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)  }  static inline void io_req_set_rsrc_node(struct io_kiocb *req, -					struct io_ring_ctx *ctx) +					struct io_ring_ctx *ctx, +					unsigned int issue_flags)  { -	if (!req->fixed_rsrc_refs) { -		req->fixed_rsrc_refs = &ctx->rsrc_node->refs; -		ctx->rsrc_cached_refs--; -		if (unlikely(ctx->rsrc_cached_refs < 0)) -			io_rsrc_refs_refill(ctx); +	if (!req->rsrc_node) { +		req->rsrc_node = ctx->rsrc_node; + +		if (!(issue_flags & IO_URING_F_UNLOCKED)) { +			lockdep_assert_held(&ctx->uring_lock); +			ctx->rsrc_cached_refs--; +			if (unlikely(ctx->rsrc_cached_refs < 0)) +				io_rsrc_refs_refill(ctx); +		} else { +			percpu_ref_get(&req->rsrc_node->refs); +		}  	}  }  static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)  { -	struct io_buffer *kbuf = req->kbuf; -	unsigned int cflags; +	if (req->flags & REQ_F_BUFFER_RING) { +		if (req->buf_list) +			req->buf_list->head++; +		req->flags &= ~REQ_F_BUFFER_RING; +	} else { +		list_add(&req->kbuf->list, list); +		req->flags &= ~REQ_F_BUFFER_SELECTED; +	} -	cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT); -	req->flags &= ~REQ_F_BUFFER_SELECTED; -	list_add(&kbuf->list, list); -	req->kbuf = NULL; -	return cflags; +	return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);  }  static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)  {  	lockdep_assert_held(&req->ctx->completion_lock); -	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) +	if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))  		return 0;  	return __io_put_kbuf(req, &req->ctx->io_buffers_comp);  } @@ -1349,7 +1676,7 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,  {  	unsigned int cflags; -	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) +	if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))  		return 0;  	/* @@ -1364,7 +1691,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,  	 * We migrate buffers from the comp_list to the issue cache list  	 * when we need one.  	 */ -	if (issue_flags & IO_URING_F_UNLOCKED) { +	if (req->flags & REQ_F_BUFFER_RING) { +		/* no buffers to recycle for this case */ +		cflags = __io_put_kbuf(req, NULL); +	} else if (issue_flags & IO_URING_F_UNLOCKED) {  		struct io_ring_ctx *ctx = req->ctx;  		spin_lock(&ctx->completion_lock); @@ -1382,15 +1712,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,  static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,  						 unsigned int bgid)  { -	struct list_head *hash_list; -	struct io_buffer_list *bl; - -	hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; -	list_for_each_entry(bl, hash_list, list) -		if (bl->bgid == bgid || bgid == -1U) -			return bl; +	if (ctx->io_bl && bgid < BGID_ARRAY) +		return &ctx->io_bl[bgid]; -	return NULL; +	return xa_load(&ctx->io_bl_xa, bgid);  }  static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) @@ -1399,54 +1724,42 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)  	struct io_buffer_list *bl;  	struct io_buffer *buf; -	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) +	if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))  		return;  	/* don't recycle if we already did IO to this buffer */  	if (req->flags & REQ_F_PARTIAL_IO)  		return; +	/* +	 * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear +	 * the flag and hence ensure that bl->head doesn't get incremented. +	 * If the tail has already been incremented, hang on to it. +	 */ +	if (req->flags & REQ_F_BUFFER_RING) { +		if (req->buf_list) { +			req->buf_index = req->buf_list->bgid; +			req->flags &= ~REQ_F_BUFFER_RING; +		} +		return; +	} -	if (issue_flags & IO_URING_F_UNLOCKED) -		mutex_lock(&ctx->uring_lock); - -	lockdep_assert_held(&ctx->uring_lock); +	io_ring_submit_lock(ctx, issue_flags);  	buf = req->kbuf;  	bl = io_buffer_get_list(ctx, buf->bgid);  	list_add(&buf->list, &bl->buf_list);  	req->flags &= ~REQ_F_BUFFER_SELECTED; -	req->kbuf = NULL; +	req->buf_index = buf->bgid; -	if (issue_flags & IO_URING_F_UNLOCKED) -		mutex_unlock(&ctx->uring_lock); +	io_ring_submit_unlock(ctx, issue_flags);  }  static bool io_match_task(struct io_kiocb *head, struct task_struct *task,  			  bool cancel_all)  	__must_hold(&req->ctx->timeout_lock)  { -	struct io_kiocb *req; -  	if (task && head->task != task)  		return false; -	if (cancel_all) -		return true; - -	io_for_each_link(req, head) { -		if (req->flags & REQ_F_INFLIGHT) -			return true; -	} -	return false; -} - -static bool io_match_linked(struct io_kiocb *head) -{ -	struct io_kiocb *req; - -	io_for_each_link(req, head) { -		if (req->flags & REQ_F_INFLIGHT) -			return true; -	} -	return false; +	return cancel_all;  }  /* @@ -1456,24 +1769,9 @@ static bool io_match_linked(struct io_kiocb *head)  static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,  			       bool cancel_all)  { -	bool matched; -  	if (task && head->task != task)  		return false; -	if (cancel_all) -		return true; - -	if (head->flags & REQ_F_LINK_TIMEOUT) { -		struct io_ring_ctx *ctx = head->ctx; - -		/* protect against races with linked timeouts */ -		spin_lock_irq(&ctx->timeout_lock); -		matched = io_match_linked(head); -		spin_unlock_irq(&ctx->timeout_lock); -	} else { -		matched = io_match_linked(head); -	} -	return matched; +	return cancel_all;  }  static inline bool req_has_async_data(struct io_kiocb *req) @@ -1493,7 +1791,12 @@ static inline void req_set_fail(struct io_kiocb *req)  static inline void req_fail_link_node(struct io_kiocb *req, int res)  {  	req_set_fail(req); -	req->result = res; +	req->cqe.res = res; +} + +static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ +	wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);  }  static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) @@ -1530,12 +1833,14 @@ static __cold void io_fallback_req_func(struct work_struct *work)  static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)  {  	struct io_ring_ctx *ctx; -	int i, hash_bits; +	int hash_bits;  	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);  	if (!ctx)  		return NULL; +	xa_init(&ctx->io_bl_xa); +  	/*  	 * Use 5 bits less than the max cq entries, that should give us around  	 * 32 entries per hash list if totally full and uniformly spread. @@ -1557,13 +1862,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)  	/* set invalid range, so io_import_fixed() fails meeting it */  	ctx->dummy_ubuf->ubuf = -1UL; -	ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS, -					sizeof(struct list_head), GFP_KERNEL); -	if (!ctx->io_buffers) -		goto err; -	for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) -		INIT_LIST_HEAD(&ctx->io_buffers[i]); -  	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,  			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))  		goto err; @@ -1595,15 +1893,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)  	INIT_WQ_LIST(&ctx->locked_free_list);  	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);  	INIT_WQ_LIST(&ctx->submit_state.compl_reqs); -#ifdef CONFIG_NET_RX_BUSY_POLL -	INIT_LIST_HEAD(&ctx->napi_list); -	spin_lock_init(&ctx->napi_lock); -#endif  	return ctx;  err:  	kfree(ctx->dummy_ubuf);  	kfree(ctx->cancel_hash); -	kfree(ctx->io_buffers); +	kfree(ctx->io_bl); +	xa_destroy(&ctx->io_bl_xa);  	kfree(ctx);  	return NULL;  } @@ -1627,23 +1922,11 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)  	return false;  } -#define FFS_NOWAIT		0x1UL -#define FFS_ISREG		0x2UL -#define FFS_MASK		~(FFS_NOWAIT|FFS_ISREG) -  static inline bool io_req_ffs_set(struct io_kiocb *req)  {  	return req->flags & REQ_F_FIXED_FILE;  } -static inline void io_req_track_inflight(struct io_kiocb *req) -{ -	if (!(req->flags & REQ_F_INFLIGHT)) { -		req->flags |= REQ_F_INFLIGHT; -		atomic_inc(¤t->io_uring->inflight_tracked); -	} -} -  static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)  {  	if (WARN_ON_ONCE(!req->link)) @@ -1665,6 +1948,17 @@ static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)  	return __io_prep_linked_timeout(req);  } +static noinline void __io_arm_ltimeout(struct io_kiocb *req) +{ +	io_queue_linked_timeout(__io_prep_linked_timeout(req)); +} + +static inline void io_arm_ltimeout(struct io_kiocb *req) +{ +	if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) +		__io_arm_ltimeout(req); +} +  static void io_prep_async_work(struct io_kiocb *req)  {  	const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -1677,6 +1971,7 @@ static void io_prep_async_work(struct io_kiocb *req)  	req->work.list.next = NULL;  	req->work.flags = 0; +	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);  	if (req->flags & REQ_F_FORCE_ASYNC)  		req->work.flags |= IO_WQ_WORK_CONCURRENT; @@ -1687,14 +1982,6 @@ static void io_prep_async_work(struct io_kiocb *req)  		if (def->unbound_nonreg_file)  			req->work.flags |= IO_WQ_WORK_UNBOUND;  	} - -	switch (req->opcode) { -	case IORING_OP_SPLICE: -	case IORING_OP_TEE: -		if (!S_ISREG(file_inode(req->splice.file_in)->i_mode)) -			req->work.flags |= IO_WQ_WORK_UNBOUND; -		break; -	}  }  static void io_prep_async_link(struct io_kiocb *req) @@ -1716,17 +2003,15 @@ static void io_prep_async_link(struct io_kiocb *req)  static inline void io_req_add_compl_list(struct io_kiocb *req)  { -	struct io_ring_ctx *ctx = req->ctx; -	struct io_submit_state *state = &ctx->submit_state; +	struct io_submit_state *state = &req->ctx->submit_state;  	if (!(req->flags & REQ_F_CQE_SKIP)) -		ctx->submit_state.flush_cqes = true; +		state->flush_cqes = true;  	wq_list_add_tail(&req->comp_list, &state->compl_reqs);  } -static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) +static void io_queue_iowq(struct io_kiocb *req, bool *dont_use)  { -	struct io_ring_ctx *ctx = req->ctx;  	struct io_kiocb *link = io_prep_linked_timeout(req);  	struct io_uring_task *tctx = req->task->io_uring; @@ -1746,8 +2031,9 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)  	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))  		req->work.flags |= IO_WQ_WORK_CANCEL; -	trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags, -					&req->work, io_wq_is_hashed(&req->work)); +	trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data, +					req->opcode, req->flags, &req->work, +					io_wq_is_hashed(&req->work));  	io_wq_enqueue(tctx->io_wq, &req->work);  	if (link)  		io_queue_linked_timeout(link); @@ -1765,8 +2051,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status)  		atomic_set(&req->ctx->cq_timeouts,  			atomic_read(&req->ctx->cq_timeouts) + 1);  		list_del_init(&req->timeout.list); -		io_fill_cqe_req(req, status, 0); -		io_put_req_deferred(req); +		io_req_tw_post_queue(req, status, 0);  	}  } @@ -1788,12 +2073,11 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)  	__must_hold(&ctx->completion_lock)  {  	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); +	struct io_kiocb *req, *tmp;  	spin_lock_irq(&ctx->timeout_lock); -	while (!list_empty(&ctx->timeout_list)) { +	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {  		u32 events_needed, events_got; -		struct io_kiocb *req = list_first_entry(&ctx->timeout_list, -						struct io_kiocb, timeout.list);  		if (io_is_timeout_noseq(req))  			break; @@ -1810,7 +2094,6 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)  		if (events_got < events_needed)  			break; -		list_del_init(&req->timeout.list);  		io_kill_timeout(req, 0);  	}  	ctx->cq_last_tm_flush = seq; @@ -1850,21 +2133,53 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)  	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);  } -static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +/* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ +static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)  {  	struct io_rings *rings = ctx->rings; -	unsigned tail, mask = ctx->cq_entries - 1; +	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); +	unsigned int shift = 0; +	unsigned int free, queued, len; -	/* -	 * writes to the cq entry need to come after reading head; the -	 * control dependency is enough as we're using WRITE_ONCE to -	 * fill the cq entry -	 */ -	if (__io_cqring_events(ctx) == ctx->cq_entries) +	if (ctx->flags & IORING_SETUP_CQE32) +		shift = 1; + +	/* userspace may cheat modifying the tail, be safe and do min */ +	queued = min(__io_cqring_events(ctx), ctx->cq_entries); +	free = ctx->cq_entries - queued; +	/* we need a contiguous range, limit based on the current array offset */ +	len = min(free, ctx->cq_entries - off); +	if (!len)  		return NULL; -	tail = ctx->cached_cq_tail++; -	return &rings->cqes[tail & mask]; +	ctx->cached_cq_tail++; +	ctx->cqe_cached = &rings->cqes[off]; +	ctx->cqe_sentinel = ctx->cqe_cached + len; +	ctx->cqe_cached++; +	return &rings->cqes[off << shift]; +} + +static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +{ +	if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { +		struct io_uring_cqe *cqe = ctx->cqe_cached; + +		if (ctx->flags & IORING_SETUP_CQE32) { +			unsigned int off = ctx->cqe_cached - ctx->rings->cqes; + +			cqe += off; +		} + +		ctx->cached_cq_tail++; +		ctx->cqe_cached++; +		return cqe; +	} + +	return __io_get_cqe(ctx);  }  static void io_eventfd_signal(struct io_ring_ctx *ctx) @@ -1935,10 +2250,14 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)  static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)  {  	bool all_flushed, posted; +	size_t cqe_size = sizeof(struct io_uring_cqe);  	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)  		return false; +	if (ctx->flags & IORING_SETUP_CQE32) +		cqe_size <<= 1; +  	posted = false;  	spin_lock(&ctx->completion_lock);  	while (!list_empty(&ctx->cq_overflow_list)) { @@ -1950,7 +2269,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)  		ocqe = list_first_entry(&ctx->cq_overflow_list,  					struct io_overflow_cqe, list);  		if (cqe) -			memcpy(cqe, &ocqe->cqe, sizeof(*cqe)); +			memcpy(cqe, &ocqe->cqe, cqe_size);  		else  			io_account_cq_overflow(ctx); @@ -1961,13 +2280,11 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)  	all_flushed = list_empty(&ctx->cq_overflow_list);  	if (all_flushed) { -		clear_bit(0, &ctx->check_cq_overflow); -		WRITE_ONCE(ctx->rings->sq_flags, -			   ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW); +		clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); +		atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);  	} -	if (posted) -		io_commit_cqring(ctx); +	io_commit_cqring(ctx);  	spin_unlock(&ctx->completion_lock);  	if (posted)  		io_cqring_ev_posted(ctx); @@ -1978,7 +2295,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)  {  	bool ret = true; -	if (test_bit(0, &ctx->check_cq_overflow)) { +	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {  		/* iopoll syncs against uring_lock, not completion_lock */  		if (ctx->flags & IORING_SETUP_IOPOLL)  			mutex_lock(&ctx->uring_lock); @@ -1990,19 +2307,23 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)  	return ret;  } -/* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) +static void __io_put_task(struct task_struct *task, int nr)  {  	struct io_uring_task *tctx = task->io_uring; -	if (likely(task == current)) { -		tctx->cached_refs += nr; -	} else { -		percpu_counter_sub(&tctx->inflight, nr); -		if (unlikely(atomic_read(&tctx->in_idle))) -			wake_up(&tctx->wait); -		put_task_struct_many(task, nr); -	} +	percpu_counter_sub(&tctx->inflight, nr); +	if (unlikely(atomic_read(&tctx->in_idle))) +		wake_up(&tctx->wait); +	put_task_struct_many(task, nr); +} + +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct task_struct *task, int nr) +{ +	if (likely(task == current)) +		task->io_uring->cached_refs += nr; +	else +		__io_put_task(task, nr);  }  static void io_task_refs_refill(struct io_uring_task *tctx) @@ -2036,11 +2357,18 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task)  }  static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, -				     s32 res, u32 cflags) +				     s32 res, u32 cflags, u64 extra1, +				     u64 extra2)  {  	struct io_overflow_cqe *ocqe; +	size_t ocq_size = sizeof(struct io_overflow_cqe); +	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); -	ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); +	if (is_cqe32) +		ocq_size += sizeof(struct io_uring_cqe); + +	ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); +	trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);  	if (!ocqe) {  		/*  		 * If we're in ring overflow flush mode, or in task cancel mode, @@ -2048,17 +2376,21 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,  		 * on the floor.  		 */  		io_account_cq_overflow(ctx); +		set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);  		return false;  	}  	if (list_empty(&ctx->cq_overflow_list)) { -		set_bit(0, &ctx->check_cq_overflow); -		WRITE_ONCE(ctx->rings->sq_flags, -			   ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); +		set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); +		atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);  	}  	ocqe->cqe.user_data = user_data;  	ocqe->cqe.res = res;  	ocqe->cqe.flags = cflags; +	if (is_cqe32) { +		ocqe->cqe.big_cqe[0] = extra1; +		ocqe->cqe.big_cqe[1] = extra2; +	}  	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);  	return true;  } @@ -2080,42 +2412,114 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,  		WRITE_ONCE(cqe->flags, cflags);  		return true;  	} -	return io_cqring_event_overflow(ctx, user_data, res, cflags); +	return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); +} + +static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx, +					    struct io_kiocb *req) +{ +	struct io_uring_cqe *cqe; + +	trace_io_uring_complete(req->ctx, req, req->cqe.user_data, +				req->cqe.res, req->cqe.flags, 0, 0); + +	/* +	 * If we can't get a cq entry, userspace overflowed the +	 * submission (by quite a lot). Increment the overflow count in +	 * the ring. +	 */ +	cqe = io_get_cqe(ctx); +	if (likely(cqe)) { +		memcpy(cqe, &req->cqe, sizeof(*cqe)); +		return true; +	} +	return io_cqring_event_overflow(ctx, req->cqe.user_data, +					req->cqe.res, req->cqe.flags, 0, 0); +} + +static inline bool __io_fill_cqe32_req_filled(struct io_ring_ctx *ctx, +					      struct io_kiocb *req) +{ +	struct io_uring_cqe *cqe; +	u64 extra1 = req->extra1; +	u64 extra2 = req->extra2; + +	trace_io_uring_complete(req->ctx, req, req->cqe.user_data, +				req->cqe.res, req->cqe.flags, extra1, extra2); + +	/* +	 * If we can't get a cq entry, userspace overflowed the +	 * submission (by quite a lot). Increment the overflow count in +	 * the ring. +	 */ +	cqe = io_get_cqe(ctx); +	if (likely(cqe)) { +		memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); +		cqe->big_cqe[0] = extra1; +		cqe->big_cqe[1] = extra2; +		return true; +	} + +	return io_cqring_event_overflow(ctx, req->cqe.user_data, req->cqe.res, +					req->cqe.flags, extra1, extra2);  }  static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)  { -	trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags); -	return __io_fill_cqe(req->ctx, req->user_data, res, cflags); +	trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags, 0, 0); +	return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags);  } -static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) +static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags, +				u64 extra1, u64 extra2)  { -	if (!(req->flags & REQ_F_CQE_SKIP)) -		__io_fill_cqe_req(req, res, cflags); +	struct io_ring_ctx *ctx = req->ctx; +	struct io_uring_cqe *cqe; + +	if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32))) +		return; +	if (req->flags & REQ_F_CQE_SKIP) +		return; + +	trace_io_uring_complete(ctx, req, req->cqe.user_data, res, cflags, +				extra1, extra2); + +	/* +	 * If we can't get a cq entry, userspace overflowed the +	 * submission (by quite a lot). Increment the overflow count in +	 * the ring. +	 */ +	cqe = io_get_cqe(ctx); +	if (likely(cqe)) { +		WRITE_ONCE(cqe->user_data, req->cqe.user_data); +		WRITE_ONCE(cqe->res, res); +		WRITE_ONCE(cqe->flags, cflags); +		WRITE_ONCE(cqe->big_cqe[0], extra1); +		WRITE_ONCE(cqe->big_cqe[1], extra2); +		return; +	} + +	io_cqring_event_overflow(ctx, req->cqe.user_data, res, cflags, extra1, extra2);  }  static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,  				     s32 res, u32 cflags)  {  	ctx->cq_extra++; -	trace_io_uring_complete(ctx, NULL, user_data, res, cflags); +	trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);  	return __io_fill_cqe(ctx, user_data, res, cflags);  } -static void __io_req_complete_post(struct io_kiocb *req, s32 res, -				   u32 cflags) +static void __io_req_complete_put(struct io_kiocb *req)  { -	struct io_ring_ctx *ctx = req->ctx; - -	if (!(req->flags & REQ_F_CQE_SKIP)) -		__io_fill_cqe_req(req, res, cflags);  	/*  	 * If we're the last reference to this request, add to our locked  	 * free_list cache.  	 */  	if (req_ref_put_and_test(req)) { -		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { +		struct io_ring_ctx *ctx = req->ctx; + +		if (req->flags & IO_REQ_LINK_FLAGS) {  			if (req->flags & IO_DISARM_MASK)  				io_disarm_next(req);  			if (req->link) { @@ -2123,7 +2527,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,  				req->link = NULL;  			}  		} -		io_req_put_rsrc(req, ctx); +		io_req_put_rsrc(req);  		/*  		 * Selected buffer deallocation in io_clean_op() assumes that  		 * we don't hold ->completion_lock. Clean them here to avoid @@ -2137,8 +2541,23 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,  	}  } -static void io_req_complete_post(struct io_kiocb *req, s32 res, -				 u32 cflags) +static void __io_req_complete_post(struct io_kiocb *req, s32 res, +				   u32 cflags) +{ +	if (!(req->flags & REQ_F_CQE_SKIP)) +		__io_fill_cqe_req(req, res, cflags); +	__io_req_complete_put(req); +} + +static void __io_req_complete_post32(struct io_kiocb *req, s32 res, +				   u32 cflags, u64 extra1, u64 extra2) +{ +	if (!(req->flags & REQ_F_CQE_SKIP)) +		__io_fill_cqe32_req(req, res, cflags, extra1, extra2); +	__io_req_complete_put(req); +} + +static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags)  {  	struct io_ring_ctx *ctx = req->ctx; @@ -2149,11 +2568,23 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res,  	io_cqring_ev_posted(ctx);  } +static void io_req_complete_post32(struct io_kiocb *req, s32 res, +				   u32 cflags, u64 extra1, u64 extra2) +{ +	struct io_ring_ctx *ctx = req->ctx; + +	spin_lock(&ctx->completion_lock); +	__io_req_complete_post32(req, res, cflags, extra1, extra2); +	io_commit_cqring(ctx); +	spin_unlock(&ctx->completion_lock); +	io_cqring_ev_posted(ctx); +} +  static inline void io_req_complete_state(struct io_kiocb *req, s32 res,  					 u32 cflags)  { -	req->result = res; -	req->cflags = cflags; +	req->cqe.res = res; +	req->cqe.flags = cflags;  	req->flags |= REQ_F_COMPLETE_INLINE;  } @@ -2166,8 +2597,23 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,  		io_req_complete_post(req, res, cflags);  } +static inline void __io_req_complete32(struct io_kiocb *req, +				       unsigned int issue_flags, s32 res, +				       u32 cflags, u64 extra1, u64 extra2) +{ +	if (issue_flags & IO_URING_F_COMPLETE_DEFER) { +		io_req_complete_state(req, res, cflags); +		req->extra1 = extra1; +		req->extra2 = extra2; +	} else { +		io_req_complete_post32(req, res, cflags, extra1, extra2); +	} +} +  static inline void io_req_complete(struct io_kiocb *req, s32 res)  { +	if (res < 0) +		req_set_fail(req);  	__io_req_complete(req, 0, res, 0);  } @@ -2177,17 +2623,6 @@ static void io_req_complete_failed(struct io_kiocb *req, s32 res)  	io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));  } -static void io_req_complete_fail_submit(struct io_kiocb *req) -{ -	/* -	 * We don't submit, fail them all, for that replace hardlinks with -	 * normal links. Extra REQ_F_LINK is tolerated. -	 */ -	req->flags &= ~REQ_F_HARDLINK; -	req->flags |= REQ_F_LINK; -	io_req_complete_failed(req, req->result); -} -  /*   * Don't initialise the fields below on every allocation, but do that in   * advance and keep them valid across allocations. @@ -2198,7 +2633,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)  	req->link = NULL;  	req->async_data = NULL;  	/* not necessary, but safer to zero */ -	req->result = 0; +	req->cqe.res = 0;  }  static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, @@ -2210,19 +2645,9 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,  	spin_unlock(&ctx->completion_lock);  } -/* Returns true IFF there are requests in the cache */ -static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) +static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)  { -	struct io_submit_state *state = &ctx->submit_state; - -	/* -	 * If we have more than a batch's worth of requests in our IRQ side -	 * locked cache, grab the lock and move them over to our submission -	 * side cache. -	 */ -	if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) -		io_flush_cached_locked_reqs(ctx, state); -	return !!state->free_list.next; +	return !ctx->submit_state.free_list.next;  }  /* @@ -2234,14 +2659,20 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)  static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)  	__must_hold(&ctx->uring_lock)  { -	struct io_submit_state *state = &ctx->submit_state;  	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;  	void *reqs[IO_REQ_ALLOC_BATCH]; -	struct io_kiocb *req;  	int ret, i; -	if (likely(state->free_list.next || io_flush_cached_reqs(ctx))) -		return true; +	/* +	 * If we have more than a batch's worth of requests in our IRQ side +	 * locked cache, grab the lock and move them over to our submission +	 * side cache. +	 */ +	if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { +		io_flush_cached_locked_reqs(ctx, &ctx->submit_state); +		if (!io_req_cache_empty(ctx)) +			return true; +	}  	ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); @@ -2258,17 +2689,17 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)  	percpu_ref_get_many(&ctx->refs, ret);  	for (i = 0; i < ret; i++) { -		req = reqs[i]; +		struct io_kiocb *req = reqs[i];  		io_preinit_req(req, ctx); -		wq_stack_add_head(&req->comp_list, &state->free_list); +		io_req_add_to_cache(req, ctx);  	}  	return true;  }  static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)  { -	if (unlikely(!ctx->submit_state.free_list.next)) +	if (unlikely(io_req_cache_empty(ctx)))  		return __io_alloc_req_refill(ctx);  	return true;  } @@ -2297,11 +2728,11 @@ static inline void io_dismantle_req(struct io_kiocb *req)  		io_put_file(req->file);  } -static __cold void __io_free_req(struct io_kiocb *req) +static __cold void io_free_req(struct io_kiocb *req)  {  	struct io_ring_ctx *ctx = req->ctx; -	io_req_put_rsrc(req, ctx); +	io_req_put_rsrc(req);  	io_dismantle_req(req);  	io_put_task(req->task, 1); @@ -2319,7 +2750,7 @@ static inline void io_remove_next_linked(struct io_kiocb *req)  	nxt->link = NULL;  } -static bool io_kill_linked_timeout(struct io_kiocb *req) +static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req)  	__must_hold(&req->ctx->completion_lock)  	__must_hold(&req->ctx->timeout_lock)  { @@ -2332,13 +2763,10 @@ static bool io_kill_linked_timeout(struct io_kiocb *req)  		link->timeout.head = NULL;  		if (hrtimer_try_to_cancel(&io->timer) != -1) {  			list_del(&link->timeout.list); -			/* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ -			io_fill_cqe_req(link, -ECANCELED, 0); -			io_put_req_deferred(link); -			return true; +			return link;  		}  	} -	return false; +	return NULL;  }  static void io_fail_links(struct io_kiocb *req) @@ -2352,19 +2780,19 @@ static void io_fail_links(struct io_kiocb *req)  		long res = -ECANCELED;  		if (link->flags & REQ_F_FAIL) -			res = link->result; +			res = link->cqe.res;  		nxt = link->link;  		link->link = NULL; -		trace_io_uring_fail_link(req->ctx, req, req->user_data, +		trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data,  					req->opcode, link); -		if (!ignore_cqes) { +		if (ignore_cqes) +			link->flags |= REQ_F_CQE_SKIP; +		else  			link->flags &= ~REQ_F_CQE_SKIP; -			io_fill_cqe_req(link, res, 0); -		} -		io_put_req_deferred(link); +		__io_req_complete_post(link, res, 0);  		link = nxt;  	}  } @@ -2372,25 +2800,27 @@ static void io_fail_links(struct io_kiocb *req)  static bool io_disarm_next(struct io_kiocb *req)  	__must_hold(&req->ctx->completion_lock)  { +	struct io_kiocb *link = NULL;  	bool posted = false;  	if (req->flags & REQ_F_ARM_LTIMEOUT) { -		struct io_kiocb *link = req->link; - +		link = req->link;  		req->flags &= ~REQ_F_ARM_LTIMEOUT;  		if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {  			io_remove_next_linked(req); -			/* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ -			io_fill_cqe_req(link, -ECANCELED, 0); -			io_put_req_deferred(link); +			io_req_tw_post_queue(link, -ECANCELED, 0);  			posted = true;  		}  	} else if (req->flags & REQ_F_LINK_TIMEOUT) {  		struct io_ring_ctx *ctx = req->ctx;  		spin_lock_irq(&ctx->timeout_lock); -		posted = io_kill_linked_timeout(req); +		link = io_disarm_linked_timeout(req);  		spin_unlock_irq(&ctx->timeout_lock); +		if (link) { +			posted = true; +			io_req_tw_post_queue(link, -ECANCELED, 0); +		}  	}  	if (unlikely((req->flags & REQ_F_FAIL) &&  		     !(req->flags & REQ_F_HARDLINK))) { @@ -2407,8 +2837,7 @@ static void __io_req_find_next_prep(struct io_kiocb *req)  	spin_lock(&ctx->completion_lock);  	posted = io_disarm_next(req); -	if (posted) -		io_commit_cqring(ctx); +	io_commit_cqring(ctx);  	spin_unlock(&ctx->completion_lock);  	if (posted)  		io_cqring_ev_posted(ctx); @@ -2418,8 +2847,6 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)  {  	struct io_kiocb *nxt; -	if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) -		return NULL;  	/*  	 * If LINK is set, we have dependent requests in this chain. If we  	 * didn't fail this request, queue the first one up, moving any other @@ -2437,6 +2864,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)  {  	if (!ctx)  		return; +	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) +		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);  	if (*locked) {  		io_submit_flush_completions(ctx);  		mutex_unlock(&ctx->uring_lock); @@ -2480,7 +2909,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node,  		if (likely(*uring_locked))  			req->io_task_work.func(req, uring_locked);  		else -			__io_req_complete_post(req, req->result, +			__io_req_complete_post(req, req->cqe.res,  						io_put_kbuf_comp(req));  		node = next;  	} while (node); @@ -2521,15 +2950,11 @@ static void tctx_task_work(struct callback_head *cb)  	while (1) {  		struct io_wq_work_node *node1, *node2; -		if (!tctx->task_list.first && -		    !tctx->prior_task_list.first && uring_locked) -			io_submit_flush_completions(ctx); -  		spin_lock_irq(&tctx->task_lock); -		node1 = tctx->prior_task_list.first; +		node1 = tctx->prio_task_list.first;  		node2 = tctx->task_list.first;  		INIT_WQ_LIST(&tctx->task_list); -		INIT_WQ_LIST(&tctx->prior_task_list); +		INIT_WQ_LIST(&tctx->prio_task_list);  		if (!node2 && !node1)  			tctx->task_running = false;  		spin_unlock_irq(&tctx->task_lock); @@ -2538,10 +2963,13 @@ static void tctx_task_work(struct callback_head *cb)  		if (node1)  			handle_prev_tw_list(node1, &ctx, &uring_locked); -  		if (node2)  			handle_tw_list(node2, &ctx, &uring_locked);  		cond_resched(); + +		if (data_race(!tctx->task_list.first) && +		    data_race(!tctx->prio_task_list.first) && uring_locked) +			io_submit_flush_completions(ctx);  	}  	ctx_flush_and_put(ctx, &uring_locked); @@ -2551,22 +2979,19 @@ static void tctx_task_work(struct callback_head *cb)  		io_uring_drop_tctx_refs(current);  } -static void io_req_task_work_add(struct io_kiocb *req, bool priority) +static void __io_req_task_work_add(struct io_kiocb *req, +				   struct io_uring_task *tctx, +				   struct io_wq_work_list *list)  { -	struct task_struct *tsk = req->task; -	struct io_uring_task *tctx = tsk->io_uring; -	enum task_work_notify_mode notify; +	struct io_ring_ctx *ctx = req->ctx;  	struct io_wq_work_node *node;  	unsigned long flags;  	bool running; -	WARN_ON_ONCE(!tctx); +	io_drop_inflight_file(req);  	spin_lock_irqsave(&tctx->task_lock, flags); -	if (priority) -		wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list); -	else -		wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); +	wq_list_add_tail(&req->io_task_work.node, list);  	running = tctx->task_running;  	if (!running)  		tctx->task_running = true; @@ -2576,22 +3001,15 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority)  	if (running)  		return; -	/* -	 * SQPOLL kernel thread doesn't need notification, just a wakeup. For -	 * all other cases, use TWA_SIGNAL unconditionally to ensure we're -	 * processing task_work. There's no reliable way to tell if TWA_RESUME -	 * will do the job. -	 */ -	notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; -	if (likely(!task_work_add(tsk, &tctx->task_work, notify))) { -		if (notify == TWA_NONE) -			wake_up_process(tsk); +	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) +		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + +	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))  		return; -	}  	spin_lock_irqsave(&tctx->task_lock, flags);  	tctx->task_running = false; -	node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list); +	node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list);  	spin_unlock_irqrestore(&tctx->task_lock, flags);  	while (node) { @@ -2603,47 +3021,73 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority)  	}  } -static void io_req_task_cancel(struct io_kiocb *req, bool *locked) +static void io_req_task_work_add(struct io_kiocb *req)  { -	struct io_ring_ctx *ctx = req->ctx; +	struct io_uring_task *tctx = req->task->io_uring; + +	__io_req_task_work_add(req, tctx, &tctx->task_list); +} + +static void io_req_task_prio_work_add(struct io_kiocb *req) +{ +	struct io_uring_task *tctx = req->task->io_uring; +	if (req->ctx->flags & IORING_SETUP_SQPOLL) +		__io_req_task_work_add(req, tctx, &tctx->prio_task_list); +	else +		__io_req_task_work_add(req, tctx, &tctx->task_list); +} + +static void io_req_tw_post(struct io_kiocb *req, bool *locked) +{ +	io_req_complete_post(req, req->cqe.res, req->cqe.flags); +} + +static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) +{ +	req->cqe.res = res; +	req->cqe.flags = cflags; +	req->io_task_work.func = io_req_tw_post; +	io_req_task_work_add(req); +} + +static void io_req_task_cancel(struct io_kiocb *req, bool *locked) +{  	/* not needed for normal modes, but SQPOLL depends on it */ -	io_tw_lock(ctx, locked); -	io_req_complete_failed(req, req->result); +	io_tw_lock(req->ctx, locked); +	io_req_complete_failed(req, req->cqe.res);  }  static void io_req_task_submit(struct io_kiocb *req, bool *locked)  { -	struct io_ring_ctx *ctx = req->ctx; - -	io_tw_lock(ctx, locked); +	io_tw_lock(req->ctx, locked);  	/* req->task == current here, checking PF_EXITING is safe */  	if (likely(!(req->task->flags & PF_EXITING))) -		__io_queue_sqe(req); +		io_queue_sqe(req);  	else  		io_req_complete_failed(req, -EFAULT);  }  static void io_req_task_queue_fail(struct io_kiocb *req, int ret)  { -	req->result = ret; +	req->cqe.res = ret;  	req->io_task_work.func = io_req_task_cancel; -	io_req_task_work_add(req, false); +	io_req_task_work_add(req);  }  static void io_req_task_queue(struct io_kiocb *req)  {  	req->io_task_work.func = io_req_task_submit; -	io_req_task_work_add(req, false); +	io_req_task_work_add(req);  }  static void io_req_task_queue_reissue(struct io_kiocb *req)  { -	req->io_task_work.func = io_queue_async_work; -	io_req_task_work_add(req, false); +	req->io_task_work.func = io_queue_iowq; +	io_req_task_work_add(req);  } -static inline void io_queue_next(struct io_kiocb *req) +static void io_queue_next(struct io_kiocb *req)  {  	struct io_kiocb *nxt = io_req_find_next(req); @@ -2651,17 +3095,6 @@ static inline void io_queue_next(struct io_kiocb *req)  		io_req_task_queue(nxt);  } -static void io_free_req(struct io_kiocb *req) -{ -	io_queue_next(req); -	__io_free_req(req); -} - -static void io_free_req_work(struct io_kiocb *req, bool *locked) -{ -	io_free_req(req); -} -  static void io_free_batch_list(struct io_ring_ctx *ctx,  				struct io_wq_work_node *node)  	__must_hold(&ctx->uring_lock) @@ -2673,15 +3106,30 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,  		struct io_kiocb *req = container_of(node, struct io_kiocb,  						    comp_list); -		if (unlikely(req->flags & REQ_F_REFCOUNT)) { -			node = req->comp_list.next; -			if (!req_ref_put_and_test(req)) -				continue; +		if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { +			if (req->flags & REQ_F_REFCOUNT) { +				node = req->comp_list.next; +				if (!req_ref_put_and_test(req)) +					continue; +			} +			if ((req->flags & REQ_F_POLLED) && req->apoll) { +				struct async_poll *apoll = req->apoll; + +				if (apoll->double_poll) +					kfree(apoll->double_poll); +				list_add(&apoll->poll.wait.entry, +						&ctx->apoll_cache); +				req->flags &= ~REQ_F_POLLED; +			} +			if (req->flags & IO_REQ_LINK_FLAGS) +				io_queue_next(req); +			if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) +				io_clean_op(req);  		} +		if (!(req->flags & REQ_F_FIXED_FILE)) +			io_put_file(req->file);  		io_req_put_rsrc_locked(req, ctx); -		io_queue_next(req); -		io_dismantle_req(req);  		if (req->task != task) {  			if (task) @@ -2691,7 +3139,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,  		}  		task_refs++;  		node = req->comp_list.next; -		wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); +		io_req_add_to_cache(req, ctx);  	} while (node);  	if (task) @@ -2710,16 +3158,11 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)  			struct io_kiocb *req = container_of(node, struct io_kiocb,  						    comp_list); -			if (!(req->flags & REQ_F_CQE_SKIP)) -				__io_fill_cqe_req(req, req->result, req->cflags); -			if ((req->flags & REQ_F_POLLED) && req->apoll) { -				struct async_poll *apoll = req->apoll; - -				if (apoll->double_poll) -					kfree(apoll->double_poll); -				list_add(&apoll->poll.wait.entry, -						&ctx->apoll_cache); -				req->flags &= ~REQ_F_POLLED; +			if (!(req->flags & REQ_F_CQE_SKIP)) { +				if (!(ctx->flags & IORING_SETUP_CQE32)) +					__io_fill_cqe_req_filled(ctx, req); +				else +					__io_fill_cqe32_req_filled(ctx, req);  			}  		} @@ -2742,23 +3185,18 @@ static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)  	struct io_kiocb *nxt = NULL;  	if (req_ref_put_and_test(req)) { -		nxt = io_req_find_next(req); -		__io_free_req(req); +		if (unlikely(req->flags & IO_REQ_LINK_FLAGS)) +			nxt = io_req_find_next(req); +		io_free_req(req);  	}  	return nxt;  }  static inline void io_put_req(struct io_kiocb *req)  { -	if (req_ref_put_and_test(req)) -		io_free_req(req); -} - -static inline void io_put_req_deferred(struct io_kiocb *req) -{  	if (req_ref_put_and_test(req)) { -		req->io_task_work.func = io_free_req_work; -		io_req_task_work_add(req, false); +		io_queue_next(req); +		io_free_req(req);  	}  } @@ -2841,11 +3279,10 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)  		/* order with io_complete_rw_iopoll(), e.g. ->result updates */  		if (!smp_load_acquire(&req->iopoll_completed))  			break; +		nr_events++;  		if (unlikely(req->flags & REQ_F_CQE_SKIP))  			continue; - -		__io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0)); -		nr_events++; +		__io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0));  	}  	if (unlikely(!nr_events)) @@ -2891,22 +3328,26 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)  {  	unsigned int nr_events = 0;  	int ret = 0; +	unsigned long check_cq;  	/* -	 * We disallow the app entering submit/complete with polling, but we -	 * still need to lock the ring to prevent racing with polled issue -	 * that got punted to a workqueue. -	 */ -	mutex_lock(&ctx->uring_lock); -	/*  	 * Don't enter poll loop if we already have events pending.  	 * If we do, we can potentially be spinning for commands that  	 * already triggered a CQE (eg in error).  	 */ -	if (test_bit(0, &ctx->check_cq_overflow)) +	check_cq = READ_ONCE(ctx->check_cq); +	if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))  		__io_cqring_overflow_flush(ctx, false);  	if (io_cqring_events(ctx)) -		goto out; +		return 0; + +	/* +	 * Similarly do not spin if we have not informed the user of any +	 * dropped CQE. +	 */ +	if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) +		return -EBADR; +  	do {  		/*  		 * If a submit got punted to a workqueue, we can have the @@ -2936,8 +3377,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)  		nr_events += ret;  		ret = 0;  	} while (nr_events < min && !need_resched()); -out: -	mutex_unlock(&ctx->uring_lock); +  	return ret;  } @@ -3010,21 +3450,21 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)  	} else {  		fsnotify_access(req->file);  	} -	if (unlikely(res != req->result)) { +	if (unlikely(res != req->cqe.res)) {  		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&  		    io_rw_should_reissue(req)) {  			req->flags |= REQ_F_REISSUE;  			return true;  		}  		req_set_fail(req); -		req->result = res; +		req->cqe.res = res;  	}  	return false;  }  static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)  { -	int res = req->result; +	int res = req->cqe.res;  	if (*locked) {  		io_req_complete_state(req, res, io_put_kbuf(req, 0)); @@ -3040,7 +3480,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res,  {  	if (__io_complete_rw_common(req, res))  		return; -	__io_req_complete(req, issue_flags, req->result, +	__io_req_complete(req, issue_flags, req->cqe.res,  				io_put_kbuf(req, issue_flags));  } @@ -3050,9 +3490,9 @@ static void io_complete_rw(struct kiocb *kiocb, long res)  	if (__io_complete_rw_common(req, res))  		return; -	req->result = res; +	req->cqe.res = res;  	req->io_task_work.func = io_req_task_complete; -	io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL)); +	io_req_task_prio_work_add(req);  }  static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) @@ -3061,12 +3501,12 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)  	if (kiocb->ki_flags & IOCB_WRITE)  		kiocb_end_write(req); -	if (unlikely(res != req->result)) { +	if (unlikely(res != req->cqe.res)) {  		if (res == -EAGAIN && io_rw_should_reissue(req)) {  			req->flags |= REQ_F_REISSUE;  			return;  		} -		req->result = res; +		req->cqe.res = res;  	}  	/* order with io_iopoll_complete() checking ->iopoll_completed */ @@ -3176,6 +3616,8 @@ static unsigned int io_file_get_flags(struct file *file)  		res |= FFS_ISREG;  	if (__io_file_supports_nowait(file, mode))  		res |= FFS_NOWAIT; +	if (io_file_need_scm(file)) +		res |= FFS_SCM;  	return res;  } @@ -3186,42 +3628,11 @@ static inline bool io_file_supports_nowait(struct io_kiocb *req)  static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)  { -	struct io_ring_ctx *ctx = req->ctx;  	struct kiocb *kiocb = &req->rw.kiocb; -	struct file *file = req->file;  	unsigned ioprio;  	int ret; -	if (!io_req_ffs_set(req)) -		req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; -  	kiocb->ki_pos = READ_ONCE(sqe->off); -	kiocb->ki_flags = iocb_flags(file); -	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); -	if (unlikely(ret)) -		return ret; - -	/* -	 * If the file is marked O_NONBLOCK, still allow retry for it if it -	 * supports async. Otherwise it's impossible to use O_NONBLOCK files -	 * reliably. If not, or it IOCB_NOWAIT is set, don't retry. -	 */ -	if ((kiocb->ki_flags & IOCB_NOWAIT) || -	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) -		req->flags |= REQ_F_NOWAIT; - -	if (ctx->flags & IORING_SETUP_IOPOLL) { -		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) -			return -EOPNOTSUPP; - -		kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; -		kiocb->ki_complete = io_complete_rw_iopoll; -		req->iopoll_completed = 0; -	} else { -		if (kiocb->ki_flags & IOCB_HIPRI) -			return -EINVAL; -		kiocb->ki_complete = io_complete_rw; -	}  	ioprio = READ_ONCE(sqe->ioprio);  	if (ioprio) { @@ -3237,6 +3648,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	req->imu = NULL;  	req->rw.addr = READ_ONCE(sqe->addr);  	req->rw.len = READ_ONCE(sqe->len); +	req->rw.flags = READ_ONCE(sqe->rw_flags); +	/* used for fixed read/write too - just read unconditionally */  	req->buf_index = READ_ONCE(sqe->buf_index);  	return 0;  } @@ -3265,19 +3678,18 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)  static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)  {  	struct kiocb *kiocb = &req->rw.kiocb; -	bool is_stream = req->file->f_mode & FMODE_STREAM; -	if (kiocb->ki_pos == -1) { -		if (!is_stream) { -			req->flags |= REQ_F_CUR_POS; -			kiocb->ki_pos = req->file->f_pos; -			return &kiocb->ki_pos; -		} else { -			kiocb->ki_pos = 0; -			return NULL; -		} +	if (kiocb->ki_pos != -1) +		return &kiocb->ki_pos; + +	if (!(req->file->f_mode & FMODE_STREAM)) { +		req->flags |= REQ_F_CUR_POS; +		kiocb->ki_pos = req->file->f_pos; +		return &kiocb->ki_pos;  	} -	return is_stream ? NULL : &kiocb->ki_pos; + +	kiocb->ki_pos = 0; +	return NULL;  }  static void kiocb_done(struct io_kiocb *req, ssize_t ret, @@ -3367,7 +3779,8 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter  	return 0;  } -static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) +static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, +			   unsigned int issue_flags)  {  	struct io_mapped_ubuf *imu = req->imu;  	u16 index, buf_index = req->buf_index; @@ -3377,7 +3790,7 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)  		if (unlikely(buf_index >= ctx->nr_user_bufs))  			return -EFAULT; -		io_req_set_rsrc_node(req, ctx); +		io_req_set_rsrc_node(req, ctx, issue_flags);  		index = array_index_nospec(buf_index, ctx->nr_user_bufs);  		imu = READ_ONCE(ctx->user_bufs[index]);  		req->imu = imu; @@ -3385,77 +3798,96 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)  	return __io_import_fixed(req, rw, iter, imu);  } -static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) -{ -	if (needs_lock) -		mutex_unlock(&ctx->uring_lock); -} - -static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) +static int io_buffer_add_list(struct io_ring_ctx *ctx, +			      struct io_buffer_list *bl, unsigned int bgid)  { -	/* -	 * "Normal" inline submissions always hold the uring_lock, since we -	 * grab it from the system call. Same is true for the SQPOLL offload. -	 * The only exception is when we've detached the request and issue it -	 * from an async worker thread, grab the lock for that case. -	 */ -	if (needs_lock) -		mutex_lock(&ctx->uring_lock); -} - -static void io_buffer_add_list(struct io_ring_ctx *ctx, -			       struct io_buffer_list *bl, unsigned int bgid) -{ -	struct list_head *list; - -	list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; -	INIT_LIST_HEAD(&bl->buf_list);  	bl->bgid = bgid; -	list_add(&bl->list, list); +	if (bgid < BGID_ARRAY) +		return 0; + +	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));  } -static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, -					  int bgid, unsigned int issue_flags) +static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, +					      struct io_buffer_list *bl)  { -	struct io_buffer *kbuf = req->kbuf; -	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; -	struct io_ring_ctx *ctx = req->ctx; -	struct io_buffer_list *bl; - -	if (req->flags & REQ_F_BUFFER_SELECTED) -		return kbuf; +	if (!list_empty(&bl->buf_list)) { +		struct io_buffer *kbuf; -	io_ring_submit_lock(ctx, needs_lock); - -	lockdep_assert_held(&ctx->uring_lock); - -	bl = io_buffer_get_list(ctx, bgid); -	if (bl && !list_empty(&bl->buf_list)) {  		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);  		list_del(&kbuf->list);  		if (*len > kbuf->len)  			*len = kbuf->len;  		req->flags |= REQ_F_BUFFER_SELECTED;  		req->kbuf = kbuf; +		req->buf_index = kbuf->bid; +		return u64_to_user_ptr(kbuf->addr); +	} +	return NULL; +} + +static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, +					  struct io_buffer_list *bl, +					  unsigned int issue_flags) +{ +	struct io_uring_buf_ring *br = bl->buf_ring; +	struct io_uring_buf *buf; +	__u32 head = bl->head; + +	if (unlikely(smp_load_acquire(&br->tail) == head)) { +		io_ring_submit_unlock(req->ctx, issue_flags); +		return NULL; +	} + +	head &= bl->mask; +	if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { +		buf = &br->bufs[head];  	} else { -		kbuf = ERR_PTR(-ENOBUFS); +		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); +		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE - 1; +		buf = page_address(bl->buf_pages[index]); +		buf += off;  	} +	if (*len > buf->len) +		*len = buf->len; +	req->flags |= REQ_F_BUFFER_RING; +	req->buf_list = bl; +	req->buf_index = buf->bid; -	io_ring_submit_unlock(req->ctx, needs_lock); -	return kbuf; +	if (issue_flags & IO_URING_F_UNLOCKED) { +		/* +		 * If we came in unlocked, we have no choice but to consume the +		 * buffer here. This does mean it'll be pinned until the IO +		 * completes. But coming in unlocked means we're in io-wq +		 * context, hence there should be no further retry. For the +		 * locked case, the caller must ensure to call the commit when +		 * the transfer completes (or if we get -EAGAIN and must poll +		 * or retry). +		 */ +		req->buf_list = NULL; +		bl->head++; +	} +	return u64_to_user_ptr(buf->addr);  } -static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, -					unsigned int issue_flags) +static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, +				     unsigned int issue_flags)  { -	struct io_buffer *kbuf; -	u16 bgid; +	struct io_ring_ctx *ctx = req->ctx; +	struct io_buffer_list *bl; +	void __user *ret = NULL; + +	io_ring_submit_lock(req->ctx, issue_flags); -	bgid = req->buf_index; -	kbuf = io_buffer_select(req, len, bgid, issue_flags); -	if (IS_ERR(kbuf)) -		return kbuf; -	return u64_to_user_ptr(kbuf->addr); +	bl = io_buffer_get_list(ctx, req->buf_index); +	if (likely(bl)) { +		if (bl->buf_nr_pages) +			ret = io_ring_buffer_select(req, len, bl, issue_flags); +		else +			ret = io_provided_buffer_select(req, len, bl); +	} +	io_ring_submit_unlock(req->ctx, issue_flags); +	return ret;  }  #ifdef CONFIG_COMPAT @@ -3465,7 +3897,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,  	struct compat_iovec __user *uiov;  	compat_ssize_t clen;  	void __user *buf; -	ssize_t len; +	size_t len;  	uiov = u64_to_user_ptr(req->rw.addr);  	if (!access_ok(uiov, sizeof(*uiov))) @@ -3476,11 +3908,12 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,  		return -EINVAL;  	len = clen; -	buf = io_rw_buffer_select(req, &len, issue_flags); -	if (IS_ERR(buf)) -		return PTR_ERR(buf); +	buf = io_buffer_select(req, &len, issue_flags); +	if (!buf) +		return -ENOBUFS; +	req->rw.addr = (unsigned long) buf;  	iov[0].iov_base = buf; -	iov[0].iov_len = (compat_size_t) len; +	req->rw.len = iov[0].iov_len = (compat_size_t) len;  	return 0;  }  #endif @@ -3498,22 +3931,21 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,  	len = iov[0].iov_len;  	if (len < 0)  		return -EINVAL; -	buf = io_rw_buffer_select(req, &len, issue_flags); -	if (IS_ERR(buf)) -		return PTR_ERR(buf); +	buf = io_buffer_select(req, &len, issue_flags); +	if (!buf) +		return -ENOBUFS; +	req->rw.addr = (unsigned long) buf;  	iov[0].iov_base = buf; -	iov[0].iov_len = len; +	req->rw.len = iov[0].iov_len = len;  	return 0;  }  static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,  				    unsigned int issue_flags)  { -	if (req->flags & REQ_F_BUFFER_SELECTED) { -		struct io_buffer *kbuf = req->kbuf; - -		iov[0].iov_base = u64_to_user_ptr(kbuf->addr); -		iov[0].iov_len = kbuf->len; +	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { +		iov[0].iov_base = u64_to_user_ptr(req->rw.addr); +		iov[0].iov_len = req->rw.len;  		return 0;  	}  	if (req->rw.len != 1) @@ -3527,6 +3959,13 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,  	return __io_iov_buffer_select(req, iov, issue_flags);  } +static inline bool io_do_buffer_select(struct io_kiocb *req) +{ +	if (!(req->flags & REQ_F_BUFFER_SELECT)) +		return false; +	return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); +} +  static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,  				       struct io_rw_state *s,  				       unsigned int issue_flags) @@ -3539,24 +3978,21 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,  	ssize_t ret;  	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { -		ret = io_import_fixed(req, rw, iter); +		ret = io_import_fixed(req, rw, iter, issue_flags);  		if (ret)  			return ERR_PTR(ret);  		return NULL;  	} -	/* buffer index only valid with fixed read/write, or buffer select  */ -	if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) -		return ERR_PTR(-EINVAL); -  	buf = u64_to_user_ptr(req->rw.addr);  	sqe_len = req->rw.len;  	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { -		if (req->flags & REQ_F_BUFFER_SELECT) { -			buf = io_rw_buffer_select(req, &sqe_len, issue_flags); -			if (IS_ERR(buf)) -				return ERR_CAST(buf); +		if (io_do_buffer_select(req)) { +			buf = io_buffer_select(req, &sqe_len, issue_flags); +			if (!buf) +				return ERR_PTR(-ENOBUFS); +			req->rw.addr = (unsigned long) buf;  			req->rw.len = sqe_len;  		} @@ -3740,13 +4176,6 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)  	return 0;  } -static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ -	if (unlikely(!(req->file->f_mode & FMODE_READ))) -		return -EBADF; -	return io_prep_rw(req, sqe); -} -  /*   * This is our waitqueue callback handler, registered through __folio_lock_async()   * when we initially tried to do the IO with the iocb armed our waitqueue. @@ -3834,6 +4263,50 @@ static bool need_read_all(struct io_kiocb *req)  		S_ISBLK(file_inode(req->file)->i_mode);  } +static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) +{ +	struct kiocb *kiocb = &req->rw.kiocb; +	struct io_ring_ctx *ctx = req->ctx; +	struct file *file = req->file; +	int ret; + +	if (unlikely(!file || !(file->f_mode & mode))) +		return -EBADF; + +	if (!io_req_ffs_set(req)) +		req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; + +	kiocb->ki_flags = iocb_flags(file); +	ret = kiocb_set_rw_flags(kiocb, req->rw.flags); +	if (unlikely(ret)) +		return ret; + +	/* +	 * If the file is marked O_NONBLOCK, still allow retry for it if it +	 * supports async. Otherwise it's impossible to use O_NONBLOCK files +	 * reliably. If not, or it IOCB_NOWAIT is set, don't retry. +	 */ +	if ((kiocb->ki_flags & IOCB_NOWAIT) || +	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) +		req->flags |= REQ_F_NOWAIT; + +	if (ctx->flags & IORING_SETUP_IOPOLL) { +		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) +			return -EOPNOTSUPP; + +		kiocb->private = NULL; +		kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; +		kiocb->ki_complete = io_complete_rw_iopoll; +		req->iopoll_completed = 0; +	} else { +		if (kiocb->ki_flags & IOCB_HIPRI) +			return -EINVAL; +		kiocb->ki_complete = io_complete_rw; +	} + +	return 0; +} +  static int io_read(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_rw_state __s, *s = &__s; @@ -3869,7 +4342,12 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)  		iov_iter_restore(&s->iter, &s->iter_state);  		iovec = NULL;  	} -	req->result = iov_iter_count(&s->iter); +	ret = io_rw_init_file(req, FMODE_READ); +	if (unlikely(ret)) { +		kfree(iovec); +		return ret; +	} +	req->cqe.res = iov_iter_count(&s->iter);  	if (force_nonblock) {  		/* If the file doesn't support async, just async punt */ @@ -3885,7 +4363,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)  	ppos = io_kiocb_update_pos(req); -	ret = rw_verify_area(READ, req->file, ppos, req->result); +	ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);  	if (unlikely(ret)) {  		kfree(iovec);  		return ret; @@ -3907,7 +4385,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)  		ret = 0;  	} else if (ret == -EIOCBQUEUED) {  		goto out_free; -	} else if (ret == req->result || ret <= 0 || !force_nonblock || +	} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||  		   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {  		/* read all, failed, already did sync or don't want to retry */  		goto done; @@ -3972,13 +4450,6 @@ out_free:  	return 0;  } -static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ -	if (unlikely(!(req->file->f_mode & FMODE_WRITE))) -		return -EBADF; -	return io_prep_rw(req, sqe); -} -  static int io_write(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_rw_state __s, *s = &__s; @@ -3999,7 +4470,12 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)  		iov_iter_restore(&s->iter, &s->iter_state);  		iovec = NULL;  	} -	req->result = iov_iter_count(&s->iter); +	ret = io_rw_init_file(req, FMODE_WRITE); +	if (unlikely(ret)) { +		kfree(iovec); +		return ret; +	} +	req->cqe.res = iov_iter_count(&s->iter);  	if (force_nonblock) {  		/* If the file doesn't support async, just async punt */ @@ -4019,7 +4495,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)  	ppos = io_kiocb_update_pos(req); -	ret = rw_verify_area(WRITE, req->file, ppos, req->result); +	ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);  	if (unlikely(ret))  		goto out_free; @@ -4083,9 +4559,7 @@ static int io_renameat_prep(struct io_kiocb *req,  	struct io_rename *ren = &req->rename;  	const char __user *oldf, *newf; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) +	if (sqe->buf_index || sqe->splice_fd_in)  		return -EINVAL;  	if (unlikely(req->flags & REQ_F_FIXED_FILE))  		return -EBADF; @@ -4122,22 +4596,257 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)  				ren->newpath, ren->flags);  	req->flags &= ~REQ_F_NEED_CLEANUP; -	if (ret < 0) -		req_set_fail(req);  	io_req_complete(req, ret);  	return 0;  } +static inline void __io_xattr_finish(struct io_kiocb *req) +{ +	struct io_xattr *ix = &req->xattr; + +	if (ix->filename) +		putname(ix->filename); + +	kfree(ix->ctx.kname); +	kvfree(ix->ctx.kvalue); +} + +static void io_xattr_finish(struct io_kiocb *req, int ret) +{ +	req->flags &= ~REQ_F_NEED_CLEANUP; + +	__io_xattr_finish(req); +	io_req_complete(req, ret); +} + +static int __io_getxattr_prep(struct io_kiocb *req, +			      const struct io_uring_sqe *sqe) +{ +	struct io_xattr *ix = &req->xattr; +	const char __user *name; +	int ret; + +	if (unlikely(req->flags & REQ_F_FIXED_FILE)) +		return -EBADF; + +	ix->filename = NULL; +	ix->ctx.kvalue = NULL; +	name = u64_to_user_ptr(READ_ONCE(sqe->addr)); +	ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); +	ix->ctx.size = READ_ONCE(sqe->len); +	ix->ctx.flags = READ_ONCE(sqe->xattr_flags); + +	if (ix->ctx.flags) +		return -EINVAL; + +	ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); +	if (!ix->ctx.kname) +		return -ENOMEM; + +	ret = strncpy_from_user(ix->ctx.kname->name, name, +				sizeof(ix->ctx.kname->name)); +	if (!ret || ret == sizeof(ix->ctx.kname->name)) +		ret = -ERANGE; +	if (ret < 0) { +		kfree(ix->ctx.kname); +		return ret; +	} + +	req->flags |= REQ_F_NEED_CLEANUP; +	return 0; +} + +static int io_fgetxattr_prep(struct io_kiocb *req, +			     const struct io_uring_sqe *sqe) +{ +	return __io_getxattr_prep(req, sqe); +} + +static int io_getxattr_prep(struct io_kiocb *req, +			    const struct io_uring_sqe *sqe) +{ +	struct io_xattr *ix = &req->xattr; +	const char __user *path; +	int ret; + +	ret = __io_getxattr_prep(req, sqe); +	if (ret) +		return ret; + +	path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + +	ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); +	if (IS_ERR(ix->filename)) { +		ret = PTR_ERR(ix->filename); +		ix->filename = NULL; +	} + +	return ret; +} + +static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) +{ +	struct io_xattr *ix = &req->xattr; +	int ret; + +	if (issue_flags & IO_URING_F_NONBLOCK) +		return -EAGAIN; + +	ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt), +			req->file->f_path.dentry, +			&ix->ctx); + +	io_xattr_finish(req, ret); +	return 0; +} + +static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) +{ +	struct io_xattr *ix = &req->xattr; +	unsigned int lookup_flags = LOOKUP_FOLLOW; +	struct path path; +	int ret; + +	if (issue_flags & IO_URING_F_NONBLOCK) +		return -EAGAIN; + +retry: +	ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); +	if (!ret) { +		ret = do_getxattr(mnt_user_ns(path.mnt), +				path.dentry, +				&ix->ctx); + +		path_put(&path); +		if (retry_estale(ret, lookup_flags)) { +			lookup_flags |= LOOKUP_REVAL; +			goto retry; +		} +	} + +	io_xattr_finish(req, ret); +	return 0; +} + +static int __io_setxattr_prep(struct io_kiocb *req, +			const struct io_uring_sqe *sqe) +{ +	struct io_xattr *ix = &req->xattr; +	const char __user *name; +	int ret; + +	if (unlikely(req->flags & REQ_F_FIXED_FILE)) +		return -EBADF; + +	ix->filename = NULL; +	name = u64_to_user_ptr(READ_ONCE(sqe->addr)); +	ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); +	ix->ctx.kvalue = NULL; +	ix->ctx.size = READ_ONCE(sqe->len); +	ix->ctx.flags = READ_ONCE(sqe->xattr_flags); + +	ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); +	if (!ix->ctx.kname) +		return -ENOMEM; + +	ret = setxattr_copy(name, &ix->ctx); +	if (ret) { +		kfree(ix->ctx.kname); +		return ret; +	} + +	req->flags |= REQ_F_NEED_CLEANUP; +	return 0; +} + +static int io_setxattr_prep(struct io_kiocb *req, +			const struct io_uring_sqe *sqe) +{ +	struct io_xattr *ix = &req->xattr; +	const char __user *path; +	int ret; + +	ret = __io_setxattr_prep(req, sqe); +	if (ret) +		return ret; + +	path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + +	ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); +	if (IS_ERR(ix->filename)) { +		ret = PTR_ERR(ix->filename); +		ix->filename = NULL; +	} + +	return ret; +} + +static int io_fsetxattr_prep(struct io_kiocb *req, +			const struct io_uring_sqe *sqe) +{ +	return __io_setxattr_prep(req, sqe); +} + +static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, +			struct path *path) +{ +	struct io_xattr *ix = &req->xattr; +	int ret; + +	ret = mnt_want_write(path->mnt); +	if (!ret) { +		ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx); +		mnt_drop_write(path->mnt); +	} + +	return ret; +} + +static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) +{ +	int ret; + +	if (issue_flags & IO_URING_F_NONBLOCK) +		return -EAGAIN; + +	ret = __io_setxattr(req, issue_flags, &req->file->f_path); +	io_xattr_finish(req, ret); + +	return 0; +} + +static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) +{ +	struct io_xattr *ix = &req->xattr; +	unsigned int lookup_flags = LOOKUP_FOLLOW; +	struct path path; +	int ret; + +	if (issue_flags & IO_URING_F_NONBLOCK) +		return -EAGAIN; + +retry: +	ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); +	if (!ret) { +		ret = __io_setxattr(req, issue_flags, &path); +		path_put(&path); +		if (retry_estale(ret, lookup_flags)) { +			lookup_flags |= LOOKUP_REVAL; +			goto retry; +		} +	} + +	io_xattr_finish(req, ret); +	return 0; +} +  static int io_unlinkat_prep(struct io_kiocb *req,  			    const struct io_uring_sqe *sqe)  {  	struct io_unlink *un = &req->unlink;  	const char __user *fname; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || -	    sqe->splice_fd_in) +	if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in)  		return -EINVAL;  	if (unlikely(req->flags & REQ_F_FIXED_FILE))  		return -EBADF; @@ -4171,8 +4880,6 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)  		ret = do_unlinkat(un->dfd, un->filename);  	req->flags &= ~REQ_F_NEED_CLEANUP; -	if (ret < 0) -		req_set_fail(req);  	io_req_complete(req, ret);  	return 0;  } @@ -4183,10 +4890,7 @@ static int io_mkdirat_prep(struct io_kiocb *req,  	struct io_mkdir *mkd = &req->mkdir;  	const char __user *fname; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index || -	    sqe->splice_fd_in) +	if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)  		return -EINVAL;  	if (unlikely(req->flags & REQ_F_FIXED_FILE))  		return -EBADF; @@ -4214,8 +4918,6 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)  	ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);  	req->flags &= ~REQ_F_NEED_CLEANUP; -	if (ret < 0) -		req_set_fail(req);  	io_req_complete(req, ret);  	return 0;  } @@ -4226,10 +4928,7 @@ static int io_symlinkat_prep(struct io_kiocb *req,  	struct io_symlink *sl = &req->symlink;  	const char __user *oldpath, *newpath; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index || -	    sqe->splice_fd_in) +	if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)  		return -EINVAL;  	if (unlikely(req->flags & REQ_F_FIXED_FILE))  		return -EBADF; @@ -4263,8 +4962,6 @@ static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)  	ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);  	req->flags &= ~REQ_F_NEED_CLEANUP; -	if (ret < 0) -		req_set_fail(req);  	io_req_complete(req, ret);  	return 0;  } @@ -4275,9 +4972,7 @@ static int io_linkat_prep(struct io_kiocb *req,  	struct io_hardlink *lnk = &req->hardlink;  	const char __user *oldf, *newf; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) +	if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)  		return -EINVAL;  	if (unlikely(req->flags & REQ_F_FIXED_FILE))  		return -EBADF; @@ -4314,9 +5009,97 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)  				lnk->newpath, lnk->flags);  	req->flags &= ~REQ_F_NEED_CLEANUP; +	io_req_complete(req, ret); +	return 0; +} + +static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) +{ +	req->uring_cmd.task_work_cb(&req->uring_cmd); +} + +void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, +			void (*task_work_cb)(struct io_uring_cmd *)) +{ +	struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + +	req->uring_cmd.task_work_cb = task_work_cb; +	req->io_task_work.func = io_uring_cmd_work; +	io_req_task_prio_work_add(req); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); + +/* + * Called by consumers of io_uring_cmd, if they originally returned + * -EIOCBQUEUED upon receiving the command. + */ +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) +{ +	struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); +  	if (ret < 0)  		req_set_fail(req); -	io_req_complete(req, ret); +	if (req->ctx->flags & IORING_SETUP_CQE32) +		__io_req_complete32(req, 0, ret, 0, res2, 0); +	else +		io_req_complete(req, ret); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_done); + +static int io_uring_cmd_prep_async(struct io_kiocb *req) +{ +	size_t cmd_size; + +	cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); + +	memcpy(req->async_data, req->uring_cmd.cmd, cmd_size); +	return 0; +} + +static int io_uring_cmd_prep(struct io_kiocb *req, +			     const struct io_uring_sqe *sqe) +{ +	struct io_uring_cmd *ioucmd = &req->uring_cmd; + +	if (sqe->rw_flags) +		return -EINVAL; +	ioucmd->cmd = sqe->cmd; +	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); +	return 0; +} + +static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) +{ +	struct io_uring_cmd *ioucmd = &req->uring_cmd; +	struct io_ring_ctx *ctx = req->ctx; +	struct file *file = req->file; +	int ret; + +	if (!req->file->f_op->uring_cmd) +		return -EOPNOTSUPP; + +	if (ctx->flags & IORING_SETUP_SQE128) +		issue_flags |= IO_URING_F_SQE128; +	if (ctx->flags & IORING_SETUP_CQE32) +		issue_flags |= IO_URING_F_CQE32; +	if (ctx->flags & IORING_SETUP_IOPOLL) +		issue_flags |= IO_URING_F_IOPOLL; + +	if (req_has_async_data(req)) +		ioucmd->cmd = req->async_data; + +	ret = file->f_op->uring_cmd(ioucmd, issue_flags); +	if (ret == -EAGAIN) { +		if (!req_has_async_data(req)) { +			if (io_alloc_async_data(req)) +				return -ENOMEM; +			io_uring_cmd_prep_async(req); +		} +		return -EAGAIN; +	} + +	if (ret != -EIOCBQUEUED) +		io_uring_cmd_done(ioucmd, ret, 0);  	return 0;  } @@ -4324,9 +5107,7 @@ static int io_shutdown_prep(struct io_kiocb *req,  			    const struct io_uring_sqe *sqe)  {  #if defined(CONFIG_NET) -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || +	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||  		     sqe->buf_index || sqe->splice_fd_in))  		return -EINVAL; @@ -4351,8 +5132,6 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)  		return -ENOTSOCK;  	ret = __sys_shutdown_sock(sock, req->shutdown.how); -	if (ret < 0) -		req_set_fail(req);  	io_req_complete(req, ret);  	return 0;  #else @@ -4366,21 +5145,11 @@ static int __io_splice_prep(struct io_kiocb *req,  	struct io_splice *sp = &req->splice;  	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; - -	sp->file_in = NULL;  	sp->len = READ_ONCE(sqe->len);  	sp->flags = READ_ONCE(sqe->splice_flags); -  	if (unlikely(sp->flags & ~valid_flags))  		return -EINVAL; - -	sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in), -				  (sp->flags & SPLICE_F_FD_IN_FIXED)); -	if (!sp->file_in) -		return -EBADF; -	req->flags |= REQ_F_NEED_CLEANUP; +	sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);  	return 0;  } @@ -4395,23 +5164,32 @@ static int io_tee_prep(struct io_kiocb *req,  static int io_tee(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_splice *sp = &req->splice; -	struct file *in = sp->file_in;  	struct file *out = sp->file_out;  	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; +	struct file *in;  	long ret = 0;  	if (issue_flags & IO_URING_F_NONBLOCK)  		return -EAGAIN; + +	if (sp->flags & SPLICE_F_FD_IN_FIXED) +		in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); +	else +		in = io_file_get_normal(req, sp->splice_fd_in); +	if (!in) { +		ret = -EBADF; +		goto done; +	} +  	if (sp->len)  		ret = do_tee(in, out, sp->len, flags);  	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))  		io_put_file(in); -	req->flags &= ~REQ_F_NEED_CLEANUP; - +done:  	if (ret != sp->len)  		req_set_fail(req); -	io_req_complete(req, ret); +	__io_req_complete(req, 0, ret, 0);  	return 0;  } @@ -4427,15 +5205,24 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  static int io_splice(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_splice *sp = &req->splice; -	struct file *in = sp->file_in;  	struct file *out = sp->file_out;  	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;  	loff_t *poff_in, *poff_out; +	struct file *in;  	long ret = 0;  	if (issue_flags & IO_URING_F_NONBLOCK)  		return -EAGAIN; +	if (sp->flags & SPLICE_F_FD_IN_FIXED) +		in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); +	else +		in = io_file_get_normal(req, sp->splice_fd_in); +	if (!in) { +		ret = -EBADF; +		goto done; +	} +  	poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;  	poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; @@ -4444,11 +5231,23 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags)  	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))  		io_put_file(in); -	req->flags &= ~REQ_F_NEED_CLEANUP; - +done:  	if (ret != sp->len)  		req_set_fail(req); -	io_req_complete(req, ret); +	__io_req_complete(req, 0, ret, 0); +	return 0; +} + +static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ +	/* +	 * If the ring is setup with CQE32, relay back addr/addr +	 */ +	if (req->ctx->flags & IORING_SETUP_CQE32) { +		req->nop.extra1 = READ_ONCE(sqe->addr); +		req->nop.extra2 = READ_ONCE(sqe->addr2); +	} +  	return 0;  } @@ -4457,20 +5256,31 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags)   */  static int io_nop(struct io_kiocb *req, unsigned int issue_flags)  { -	struct io_ring_ctx *ctx = req->ctx; +	unsigned int cflags; +	void __user *buf; -	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; +	if (req->flags & REQ_F_BUFFER_SELECT) { +		size_t len = 1; + +		buf = io_buffer_select(req, &len, issue_flags); +		if (!buf) +			return -ENOBUFS; +	} -	__io_req_complete(req, issue_flags, 0, 0); +	cflags = io_put_kbuf(req, issue_flags); +	if (!(req->ctx->flags & IORING_SETUP_CQE32)) +		__io_req_complete(req, issue_flags, 0, cflags); +	else +		__io_req_complete32(req, issue_flags, 0, cflags, +				    req->nop.extra1, req->nop.extra2);  	return 0;  }  static int io_msg_ring_prep(struct io_kiocb *req,  			    const struct io_uring_sqe *sqe)  { -	if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags || -		     sqe->splice_fd_in || sqe->buf_index || sqe->personality)) +	if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || +		     sqe->buf_index || sqe->personality))  		return -EINVAL;  	req->msg.user_data = READ_ONCE(sqe->off); @@ -4506,20 +5316,15 @@ done:  	if (ret < 0)  		req_set_fail(req);  	__io_req_complete(req, issue_flags, ret, 0); +	/* put file to avoid an attempt to IOPOLL the req */ +	io_put_file(req->file); +	req->file = NULL;  	return 0;  }  static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  { -	struct io_ring_ctx *ctx = req->ctx; - -	if (!req->file) -		return -EBADF; - -	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || -		     sqe->splice_fd_in)) +	if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))  		return -EINVAL;  	req->sync.flags = READ_ONCE(sqe->fsync_flags); @@ -4543,8 +5348,6 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)  	ret = vfs_fsync_range(req->file, req->sync.off,  				end > 0 ? end : LLONG_MAX,  				req->sync.flags & IORING_FSYNC_DATASYNC); -	if (ret < 0) -		req_set_fail(req);  	io_req_complete(req, ret);  	return 0;  } @@ -4552,10 +5355,7 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)  static int io_fallocate_prep(struct io_kiocb *req,  			     const struct io_uring_sqe *sqe)  { -	if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || -	    sqe->splice_fd_in) -		return -EINVAL; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) +	if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)  		return -EINVAL;  	req->sync.off = READ_ONCE(sqe->off); @@ -4573,9 +5373,7 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)  		return -EAGAIN;  	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,  				req->sync.len); -	if (ret < 0) -		req_set_fail(req); -	else +	if (ret >= 0)  		fsnotify_modify(req->file);  	io_req_complete(req, ret);  	return 0; @@ -4586,9 +5384,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe  	const char __user *fname;  	int ret; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (unlikely(sqe->ioprio || sqe->buf_index)) +	if (unlikely(sqe->buf_index))  		return -EINVAL;  	if (unlikely(req->flags & REQ_F_FIXED_FILE))  		return -EBADF; @@ -4643,6 +5439,61 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	return __io_openat_prep(req, sqe);  } +static int io_file_bitmap_get(struct io_ring_ctx *ctx) +{ +	struct io_file_table *table = &ctx->file_table; +	unsigned long nr = ctx->nr_user_files; +	int ret; + +	if (table->alloc_hint >= nr) +		table->alloc_hint = 0; + +	do { +		ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); +		if (ret != nr) { +			table->alloc_hint = ret + 1; +			return ret; +		} +		if (!table->alloc_hint) +			break; + +		nr = table->alloc_hint; +		table->alloc_hint = 0; +	} while (1); + +	return -ENFILE; +} + +static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, +			       struct file *file, unsigned int file_slot) +{ +	bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; +	struct io_ring_ctx *ctx = req->ctx; +	int ret; + +	if (alloc_slot) { +		io_ring_submit_lock(ctx, issue_flags); +		ret = io_file_bitmap_get(ctx); +		if (unlikely(ret < 0)) { +			io_ring_submit_unlock(ctx, issue_flags); +			return ret; +		} + +		file_slot = ret; +	} else { +		file_slot--; +	} + +	ret = io_install_fixed_file(req, file, issue_flags, file_slot); +	if (alloc_slot) { +		io_ring_submit_unlock(ctx, issue_flags); +		if (!ret) +			return file_slot; +	} + +	return ret; +} +  static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)  {  	struct open_flags op; @@ -4698,8 +5549,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)  	if (!fixed)  		fd_install(ret, file);  	else -		ret = io_install_fixed_file(req, file, issue_flags, -					    req->open.file_slot - 1); +		ret = io_fixed_fd_install(req, issue_flags, file, +						req->open.file_slot);  err:  	putname(req->open.filename);  	req->flags &= ~REQ_F_NEED_CLEANUP; @@ -4720,7 +5571,7 @@ static int io_remove_buffers_prep(struct io_kiocb *req,  	struct io_provide_buf *p = &req->pbuf;  	u64 tmp; -	if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || +	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||  	    sqe->splice_fd_in)  		return -EINVAL; @@ -4743,6 +5594,20 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,  	if (!nbufs)  		return 0; +	if (bl->buf_nr_pages) { +		int j; + +		i = bl->buf_ring->tail - bl->head; +		for (j = 0; j < bl->buf_nr_pages; j++) +			unpin_user_page(bl->buf_pages[j]); +		kvfree(bl->buf_pages); +		bl->buf_pages = NULL; +		bl->buf_nr_pages = 0; +		/* make sure it's seen as empty */ +		INIT_LIST_HEAD(&bl->buf_list); +		return i; +	} +  	/* the head kbuf is the list itself */  	while (!list_empty(&bl->buf_list)) {  		struct io_buffer *nxt; @@ -4764,22 +5629,23 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)  	struct io_ring_ctx *ctx = req->ctx;  	struct io_buffer_list *bl;  	int ret = 0; -	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; -	io_ring_submit_lock(ctx, needs_lock); - -	lockdep_assert_held(&ctx->uring_lock); +	io_ring_submit_lock(ctx, issue_flags);  	ret = -ENOENT;  	bl = io_buffer_get_list(ctx, p->bgid); -	if (bl) -		ret = __io_remove_buffers(ctx, bl, p->nbufs); +	if (bl) { +		ret = -EINVAL; +		/* can't use provide/remove buffers command on mapped buffers */ +		if (!bl->buf_nr_pages) +			ret = __io_remove_buffers(ctx, bl, p->nbufs); +	}  	if (ret < 0)  		req_set_fail(req);  	/* complete before unlock, IOPOLL may need the lock */  	__io_req_complete(req, issue_flags, ret, 0); -	io_ring_submit_unlock(ctx, needs_lock); +	io_ring_submit_unlock(ctx, issue_flags);  	return 0;  } @@ -4790,7 +5656,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req,  	struct io_provide_buf *p = &req->pbuf;  	u64 tmp; -	if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) +	if (sqe->rw_flags || sqe->splice_fd_in)  		return -EINVAL;  	tmp = READ_ONCE(sqe->fd); @@ -4887,26 +5753,56 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,  	return i ? 0 : -ENOMEM;  } +static __cold int io_init_bl_list(struct io_ring_ctx *ctx) +{ +	int i; + +	ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), +				GFP_KERNEL); +	if (!ctx->io_bl) +		return -ENOMEM; + +	for (i = 0; i < BGID_ARRAY; i++) { +		INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); +		ctx->io_bl[i].bgid = i; +	} + +	return 0; +} +  static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_provide_buf *p = &req->pbuf;  	struct io_ring_ctx *ctx = req->ctx;  	struct io_buffer_list *bl;  	int ret = 0; -	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; -	io_ring_submit_lock(ctx, needs_lock); +	io_ring_submit_lock(ctx, issue_flags); -	lockdep_assert_held(&ctx->uring_lock); +	if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { +		ret = io_init_bl_list(ctx); +		if (ret) +			goto err; +	}  	bl = io_buffer_get_list(ctx, p->bgid);  	if (unlikely(!bl)) { -		bl = kmalloc(sizeof(*bl), GFP_KERNEL); +		bl = kzalloc(sizeof(*bl), GFP_KERNEL);  		if (!bl) {  			ret = -ENOMEM;  			goto err;  		} -		io_buffer_add_list(ctx, bl, p->bgid); +		INIT_LIST_HEAD(&bl->buf_list); +		ret = io_buffer_add_list(ctx, bl, p->bgid); +		if (ret) { +			kfree(bl); +			goto err; +		} +	} +	/* can't add buffers via this command for a mapped buffer ring */ +	if (bl->buf_nr_pages) { +		ret = -EINVAL; +		goto err;  	}  	ret = io_add_buffers(ctx, p, bl); @@ -4915,7 +5811,7 @@ err:  		req_set_fail(req);  	/* complete before unlock, IOPOLL may need the lock */  	__io_req_complete(req, issue_flags, ret, 0); -	io_ring_submit_unlock(ctx, needs_lock); +	io_ring_submit_unlock(ctx, issue_flags);  	return 0;  } @@ -4923,9 +5819,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,  			     const struct io_uring_sqe *sqe)  {  #if defined(CONFIG_EPOLL) -	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) -		return -EINVAL; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) +	if (sqe->buf_index || sqe->splice_fd_in)  		return -EINVAL;  	req->epoll.epfd = READ_ONCE(sqe->fd); @@ -4969,9 +5863,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)  static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  {  #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) -	if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) -		return -EINVAL; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) +	if (sqe->buf_index || sqe->off || sqe->splice_fd_in)  		return -EINVAL;  	req->madvise.addr = READ_ONCE(sqe->addr); @@ -4993,8 +5885,6 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)  		return -EAGAIN;  	ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); -	if (ret < 0) -		req_set_fail(req);  	io_req_complete(req, ret);  	return 0;  #else @@ -5004,9 +5894,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)  static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  { -	if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) -		return -EINVAL; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) +	if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)  		return -EINVAL;  	req->fadvise.offset = READ_ONCE(sqe->off); @@ -5042,9 +5930,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  {  	const char __user *path; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) +	if (sqe->buf_index || sqe->splice_fd_in)  		return -EINVAL;  	if (req->flags & REQ_F_FIXED_FILE)  		return -EBADF; @@ -5080,19 +5966,13 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags)  	ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,  		       ctx->buffer); - -	if (ret < 0) -		req_set_fail(req);  	io_req_complete(req, ret);  	return 0;  }  static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  { -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || -	    sqe->rw_flags || sqe->buf_index) +	if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index)  		return -EINVAL;  	if (req->flags & REQ_F_FIXED_FILE)  		return -EBADF; @@ -5124,7 +6004,8 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)  		spin_unlock(&files->file_lock);  		goto err;  	} -	file = fdt->fd[close->fd]; +	file = rcu_dereference_protected(fdt->fd[close->fd], +			lockdep_is_held(&files->file_lock));  	if (!file || file->f_op == &io_uring_fops) {  		spin_unlock(&files->file_lock);  		file = NULL; @@ -5158,12 +6039,7 @@ err:  static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  { -	struct io_ring_ctx *ctx = req->ctx; - -	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || -		     sqe->splice_fd_in)) +	if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))  		return -EINVAL;  	req->sync.off = READ_ONCE(sqe->off); @@ -5182,13 +6058,18 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)  	ret = sync_file_range(req->file, req->sync.off, req->sync.len,  				req->sync.flags); -	if (ret < 0) -		req_set_fail(req);  	io_req_complete(req, ret);  	return 0;  }  #if defined(CONFIG_NET) +static bool io_net_retry(struct socket *sock, int flags) +{ +	if (!(flags & MSG_WAITALL)) +		return false; +	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; +} +  static int io_setup_async_msg(struct io_kiocb *req,  			      struct io_async_msghdr *kmsg)  { @@ -5234,11 +6115,16 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  {  	struct io_sr_msg *sr = &req->sr_msg; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) +	if (unlikely(sqe->file_index)) +		return -EINVAL; +	if (unlikely(sqe->addr2 || sqe->file_index))  		return -EINVAL;  	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));  	sr->len = READ_ONCE(sqe->len); +	sr->flags = READ_ONCE(sqe->addr2); +	if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) +		return -EINVAL;  	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;  	if (sr->msg_flags & MSG_DONTWAIT)  		req->flags |= REQ_F_NOWAIT; @@ -5247,12 +6133,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	if (req->ctx->compat)  		sr->msg_flags |= MSG_CMSG_COMPAT;  #endif +	sr->done_io = 0;  	return 0;  }  static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_async_msghdr iomsg, *kmsg; +	struct io_sr_msg *sr = &req->sr_msg;  	struct socket *sock;  	unsigned flags;  	int min_ret = 0; @@ -5271,7 +6159,11 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)  		kmsg = &iomsg;  	} -	flags = req->sr_msg.msg_flags; +	if (!(req->flags & REQ_F_POLLED) && +	    (sr->flags & IORING_RECVSEND_POLL_FIRST)) +		return io_setup_async_msg(req, kmsg); + +	flags = sr->msg_flags;  	if (issue_flags & IO_URING_F_NONBLOCK)  		flags |= MSG_DONTWAIT;  	if (flags & MSG_WAITALL) @@ -5284,12 +6176,21 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)  			return io_setup_async_msg(req, kmsg);  		if (ret == -ERESTARTSYS)  			ret = -EINTR; +		if (ret > 0 && io_net_retry(sock, flags)) { +			sr->done_io += ret; +			req->flags |= REQ_F_PARTIAL_IO; +			return io_setup_async_msg(req, kmsg); +		}  		req_set_fail(req);  	}  	/* fast path, check for non-NULL to avoid function call */  	if (kmsg->free_iov)  		kfree(kmsg->free_iov);  	req->flags &= ~REQ_F_NEED_CLEANUP; +	if (ret >= 0) +		ret += sr->done_io; +	else if (sr->done_io) +		ret = sr->done_io;  	__io_req_complete(req, issue_flags, ret, 0);  	return 0;  } @@ -5304,6 +6205,10 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)  	int min_ret = 0;  	int ret; +	if (!(req->flags & REQ_F_POLLED) && +	    (sr->flags & IORING_RECVSEND_POLL_FIRST)) +		return -EAGAIN; +  	sock = sock_from_file(req->file);  	if (unlikely(!sock))  		return -ENOTSOCK; @@ -5317,7 +6222,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)  	msg.msg_controllen = 0;  	msg.msg_namelen = 0; -	flags = req->sr_msg.msg_flags; +	flags = sr->msg_flags;  	if (issue_flags & IO_URING_F_NONBLOCK)  		flags |= MSG_DONTWAIT;  	if (flags & MSG_WAITALL) @@ -5330,8 +6235,19 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)  			return -EAGAIN;  		if (ret == -ERESTARTSYS)  			ret = -EINTR; +		if (ret > 0 && io_net_retry(sock, flags)) { +			sr->len -= ret; +			sr->buf += ret; +			sr->done_io += ret; +			req->flags |= REQ_F_PARTIAL_IO; +			return -EAGAIN; +		}  		req_set_fail(req);  	} +	if (ret >= 0) +		ret += sr->done_io; +	else if (sr->done_io) +		ret = sr->done_io;  	__io_req_complete(req, issue_flags, ret, 0);  	return 0;  } @@ -5423,14 +6339,6 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,  	return __io_recvmsg_copy_hdr(req, iomsg);  } -static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, -					       unsigned int issue_flags) -{ -	struct io_sr_msg *sr = &req->sr_msg; - -	return io_buffer_select(req, &sr->len, sr->bgid, issue_flags); -} -  static int io_recvmsg_prep_async(struct io_kiocb *req)  {  	int ret; @@ -5445,12 +6353,16 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  {  	struct io_sr_msg *sr = &req->sr_msg; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) +	if (unlikely(sqe->file_index)) +		return -EINVAL; +	if (unlikely(sqe->addr2 || sqe->file_index))  		return -EINVAL;  	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));  	sr->len = READ_ONCE(sqe->len); -	sr->bgid = READ_ONCE(sqe->buf_group); +	sr->flags = READ_ONCE(sqe->addr2); +	if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) +		return -EINVAL;  	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;  	if (sr->msg_flags & MSG_DONTWAIT)  		req->flags |= REQ_F_NOWAIT; @@ -5463,19 +6375,12 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	return 0;  } -static bool io_net_retry(struct socket *sock, int flags) -{ -	if (!(flags & MSG_WAITALL)) -		return false; -	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; -} -  static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_async_msghdr iomsg, *kmsg;  	struct io_sr_msg *sr = &req->sr_msg;  	struct socket *sock; -	struct io_buffer *kbuf; +	unsigned int cflags;  	unsigned flags;  	int ret, min_ret = 0;  	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -5493,24 +6398,30 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)  		kmsg = &iomsg;  	} -	if (req->flags & REQ_F_BUFFER_SELECT) { -		kbuf = io_recv_buffer_select(req, issue_flags); -		if (IS_ERR(kbuf)) -			return PTR_ERR(kbuf); -		kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); -		kmsg->fast_iov[0].iov_len = req->sr_msg.len; -		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, -				1, req->sr_msg.len); +	if (!(req->flags & REQ_F_POLLED) && +	    (sr->flags & IORING_RECVSEND_POLL_FIRST)) +		return io_setup_async_msg(req, kmsg); + +	if (io_do_buffer_select(req)) { +		void __user *buf; + +		buf = io_buffer_select(req, &sr->len, issue_flags); +		if (!buf) +			return -ENOBUFS; +		kmsg->fast_iov[0].iov_base = buf; +		kmsg->fast_iov[0].iov_len = sr->len; +		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, +				sr->len);  	} -	flags = req->sr_msg.msg_flags; +	flags = sr->msg_flags;  	if (force_nonblock)  		flags |= MSG_DONTWAIT;  	if (flags & MSG_WAITALL)  		min_ret = iov_iter_count(&kmsg->msg.msg_iter); -	ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, -					kmsg->uaddr, flags); +	kmsg->msg.msg_get_inq = 1; +	ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags);  	if (ret < min_ret) {  		if (ret == -EAGAIN && force_nonblock)  			return io_setup_async_msg(req, kmsg); @@ -5534,45 +6445,54 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)  		ret += sr->done_io;  	else if (sr->done_io)  		ret = sr->done_io; -	__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); +	cflags = io_put_kbuf(req, issue_flags); +	if (kmsg->msg.msg_inq) +		cflags |= IORING_CQE_F_SOCK_NONEMPTY; +	__io_req_complete(req, issue_flags, ret, cflags);  	return 0;  }  static int io_recv(struct io_kiocb *req, unsigned int issue_flags)  { -	struct io_buffer *kbuf;  	struct io_sr_msg *sr = &req->sr_msg;  	struct msghdr msg; -	void __user *buf = sr->buf;  	struct socket *sock;  	struct iovec iov; +	unsigned int cflags;  	unsigned flags;  	int ret, min_ret = 0;  	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; +	if (!(req->flags & REQ_F_POLLED) && +	    (sr->flags & IORING_RECVSEND_POLL_FIRST)) +		return -EAGAIN; +  	sock = sock_from_file(req->file);  	if (unlikely(!sock))  		return -ENOTSOCK; -	if (req->flags & REQ_F_BUFFER_SELECT) { -		kbuf = io_recv_buffer_select(req, issue_flags); -		if (IS_ERR(kbuf)) -			return PTR_ERR(kbuf); -		buf = u64_to_user_ptr(kbuf->addr); +	if (io_do_buffer_select(req)) { +		void __user *buf; + +		buf = io_buffer_select(req, &sr->len, issue_flags); +		if (!buf) +			return -ENOBUFS; +		sr->buf = buf;  	} -	ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); +	ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter);  	if (unlikely(ret))  		goto out_free;  	msg.msg_name = NULL; +	msg.msg_namelen = 0;  	msg.msg_control = NULL; +	msg.msg_get_inq = 1; +	msg.msg_flags = 0;  	msg.msg_controllen = 0; -	msg.msg_namelen = 0;  	msg.msg_iocb = NULL; -	msg.msg_flags = 0; -	flags = req->sr_msg.msg_flags; +	flags = sr->msg_flags;  	if (force_nonblock)  		flags |= MSG_DONTWAIT;  	if (flags & MSG_WAITALL) @@ -5601,36 +6521,49 @@ out_free:  		ret += sr->done_io;  	else if (sr->done_io)  		ret = sr->done_io; -	__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); +	cflags = io_put_kbuf(req, issue_flags); +	if (msg.msg_inq) +		cflags |= IORING_CQE_F_SOCK_NONEMPTY; +	__io_req_complete(req, issue_flags, ret, cflags);  	return 0;  }  static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  {  	struct io_accept *accept = &req->accept; +	unsigned flags; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (sqe->ioprio || sqe->len || sqe->buf_index) +	if (sqe->len || sqe->buf_index)  		return -EINVAL;  	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));  	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));  	accept->flags = READ_ONCE(sqe->accept_flags);  	accept->nofile = rlimit(RLIMIT_NOFILE); +	flags = READ_ONCE(sqe->ioprio); +	if (flags & ~IORING_ACCEPT_MULTISHOT) +		return -EINVAL;  	accept->file_slot = READ_ONCE(sqe->file_index); -	if (accept->file_slot && (accept->flags & SOCK_CLOEXEC)) -		return -EINVAL; +	if (accept->file_slot) { +		if (accept->flags & SOCK_CLOEXEC) +			return -EINVAL; +		if (flags & IORING_ACCEPT_MULTISHOT && +		    accept->file_slot != IORING_FILE_INDEX_ALLOC) +			return -EINVAL; +	}  	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))  		return -EINVAL;  	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))  		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; +	if (flags & IORING_ACCEPT_MULTISHOT) +		req->flags |= REQ_F_APOLL_MULTISHOT;  	return 0;  }  static int io_accept(struct io_kiocb *req, unsigned int issue_flags)  { +	struct io_ring_ctx *ctx = req->ctx;  	struct io_accept *accept = &req->accept;  	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;  	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; @@ -5638,6 +6571,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)  	struct file *file;  	int ret, fd; +retry:  	if (!fixed) {  		fd = __get_unused_fd_flags(accept->flags, accept->nofile);  		if (unlikely(fd < 0)) @@ -5649,7 +6583,89 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)  		if (!fixed)  			put_unused_fd(fd);  		ret = PTR_ERR(file); -		if (ret == -EAGAIN && force_nonblock) +		if (ret == -EAGAIN && force_nonblock) { +			/* +			 * if it's multishot and polled, we don't need to +			 * return EAGAIN to arm the poll infra since it +			 * has already been done +			 */ +			if ((req->flags & IO_APOLL_MULTI_POLLED) == +			    IO_APOLL_MULTI_POLLED) +				ret = 0; +			return ret; +		} +		if (ret == -ERESTARTSYS) +			ret = -EINTR; +		req_set_fail(req); +	} else if (!fixed) { +		fd_install(fd, file); +		ret = fd; +	} else { +		ret = io_fixed_fd_install(req, issue_flags, file, +						accept->file_slot); +	} + +	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { +		__io_req_complete(req, issue_flags, ret, 0); +		return 0; +	} +	if (ret >= 0) { +		bool filled; + +		spin_lock(&ctx->completion_lock); +		filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, +					 IORING_CQE_F_MORE); +		io_commit_cqring(ctx); +		spin_unlock(&ctx->completion_lock); +		if (filled) { +			io_cqring_ev_posted(ctx); +			goto retry; +		} +		ret = -ECANCELED; +	} + +	return ret; +} + +static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ +	struct io_socket *sock = &req->sock; + +	if (sqe->addr || sqe->rw_flags || sqe->buf_index) +		return -EINVAL; + +	sock->domain = READ_ONCE(sqe->fd); +	sock->type = READ_ONCE(sqe->off); +	sock->protocol = READ_ONCE(sqe->len); +	sock->file_slot = READ_ONCE(sqe->file_index); +	sock->nofile = rlimit(RLIMIT_NOFILE); + +	sock->flags = sock->type & ~SOCK_TYPE_MASK; +	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) +		return -EINVAL; +	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) +		return -EINVAL; +	return 0; +} + +static int io_socket(struct io_kiocb *req, unsigned int issue_flags) +{ +	struct io_socket *sock = &req->sock; +	bool fixed = !!sock->file_slot; +	struct file *file; +	int ret, fd; + +	if (!fixed) { +		fd = __get_unused_fd_flags(sock->flags, sock->nofile); +		if (unlikely(fd < 0)) +			return fd; +	} +	file = __sys_socket_file(sock->domain, sock->type, sock->protocol); +	if (IS_ERR(file)) { +		if (!fixed) +			put_unused_fd(fd); +		ret = PTR_ERR(file); +		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))  			return -EAGAIN;  		if (ret == -ERESTARTSYS)  			ret = -EINTR; @@ -5659,7 +6675,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)  		ret = fd;  	} else {  		ret = io_install_fixed_file(req, file, issue_flags, -					    accept->file_slot - 1); +					    sock->file_slot - 1);  	}  	__io_req_complete(req, issue_flags, ret, 0);  	return 0; @@ -5677,10 +6693,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  {  	struct io_connect *conn = &req->connect; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || -	    sqe->splice_fd_in) +	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)  		return -EINVAL;  	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -5753,112 +6766,11 @@ IO_NETOP_PREP_ASYNC(sendmsg);  IO_NETOP_PREP_ASYNC(recvmsg);  IO_NETOP_PREP_ASYNC(connect);  IO_NETOP_PREP(accept); +IO_NETOP_PREP(socket);  IO_NETOP_FN(send);  IO_NETOP_FN(recv);  #endif /* CONFIG_NET */ -#ifdef CONFIG_NET_RX_BUSY_POLL - -#define NAPI_TIMEOUT			(60 * SEC_CONVERSION) - -struct napi_entry { -	struct list_head	list; -	unsigned int		napi_id; -	unsigned long		timeout; -}; - -/* - * Add busy poll NAPI ID from sk. - */ -static void io_add_napi(struct file *file, struct io_ring_ctx *ctx) -{ -	unsigned int napi_id; -	struct socket *sock; -	struct sock *sk; -	struct napi_entry *ne; - -	if (!net_busy_loop_on()) -		return; - -	sock = sock_from_file(file); -	if (!sock) -		return; - -	sk = sock->sk; -	if (!sk) -		return; - -	napi_id = READ_ONCE(sk->sk_napi_id); - -	/* Non-NAPI IDs can be rejected */ -	if (napi_id < MIN_NAPI_ID) -		return; - -	spin_lock(&ctx->napi_lock); -	list_for_each_entry(ne, &ctx->napi_list, list) { -		if (ne->napi_id == napi_id) { -			ne->timeout = jiffies + NAPI_TIMEOUT; -			goto out; -		} -	} - -	ne = kmalloc(sizeof(*ne), GFP_NOWAIT); -	if (!ne) -		goto out; - -	ne->napi_id = napi_id; -	ne->timeout = jiffies + NAPI_TIMEOUT; -	list_add_tail(&ne->list, &ctx->napi_list); -out: -	spin_unlock(&ctx->napi_lock); -} - -static inline void io_check_napi_entry_timeout(struct napi_entry *ne) -{ -	if (time_after(jiffies, ne->timeout)) { -		list_del(&ne->list); -		kfree(ne); -	} -} - -/* - * Busy poll if globally on and supporting sockets found - */ -static bool io_napi_busy_loop(struct list_head *napi_list) -{ -	struct napi_entry *ne, *n; - -	list_for_each_entry_safe(ne, n, napi_list, list) { -		napi_busy_loop(ne->napi_id, NULL, NULL, true, -			       BUSY_POLL_BUDGET); -		io_check_napi_entry_timeout(ne); -	} -	return !list_empty(napi_list); -} - -static void io_free_napi_list(struct io_ring_ctx *ctx) -{ -	spin_lock(&ctx->napi_lock); -	while (!list_empty(&ctx->napi_list)) { -		struct napi_entry *ne = -			list_first_entry(&ctx->napi_list, struct napi_entry, -					 list); - -		list_del(&ne->list); -		kfree(ne); -	} -	spin_unlock(&ctx->napi_lock); -} -#else -static inline void io_add_napi(struct file *file, struct io_ring_ctx *ctx) -{ -} - -static inline void io_free_napi_list(struct io_ring_ctx *ctx) -{ -} -#endif /* CONFIG_NET_RX_BUSY_POLL */ -  struct io_poll_table {  	struct poll_table_struct pt;  	struct io_kiocb *req; @@ -5905,7 +6817,7 @@ static void io_poll_req_insert(struct io_kiocb *req)  	struct io_ring_ctx *ctx = req->ctx;  	struct hlist_head *list; -	list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; +	list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)];  	hlist_add_head(&req->hash_node, list);  } @@ -5964,23 +6876,23 @@ static void io_poll_remove_entries(struct io_kiocb *req)  	rcu_read_unlock();  } +static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags);  /*   * All poll tw should go through this. Checks for poll events, manages   * references, does rewait, etc.   *   * Returns a negative error on failure. >0 when no action require, which is   * either spurious wakeup or multishot CQE is served. 0 when it's done with - * the request, then the mask is stored in req->result. + * the request, then the mask is stored in req->cqe.res.   */ -static int io_poll_check_events(struct io_kiocb *req) +static int io_poll_check_events(struct io_kiocb *req, bool *locked)  {  	struct io_ring_ctx *ctx = req->ctx; -	struct io_poll_iocb *poll = io_poll_get_single(req); -	int v; +	int v, ret;  	/* req->task == current here, checking PF_EXITING is safe */  	if (unlikely(req->task->flags & PF_EXITING)) -		io_poll_mark_cancelled(req); +		return -ECANCELED;  	do {  		v = atomic_read(&req->poll_refs); @@ -5991,30 +6903,46 @@ static int io_poll_check_events(struct io_kiocb *req)  		if (v & IO_POLL_CANCEL_FLAG)  			return -ECANCELED; -		if (!req->result) { -			struct poll_table_struct pt = { ._key = req->cflags }; +		if (!req->cqe.res) { +			struct poll_table_struct pt = { ._key = req->apoll_events }; +			unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED; -			req->result = vfs_poll(req->file, &pt) & req->cflags; +			if (unlikely(!io_assign_file(req, flags))) +				return -EBADF; +			req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;  		} -		/* multishot, just fill an CQE and proceed */ -		if (req->result && !(req->cflags & EPOLLONESHOT)) { -			__poll_t mask = mangle_poll(req->result & poll->events); +		if ((unlikely(!req->cqe.res))) +			continue; +		if (req->apoll_events & EPOLLONESHOT) +			return 0; + +		/* multishot, just fill a CQE and proceed */ +		if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { +			__poll_t mask = mangle_poll(req->cqe.res & +						    req->apoll_events);  			bool filled;  			spin_lock(&ctx->completion_lock); -			filled = io_fill_cqe_aux(ctx, req->user_data, mask, -						 IORING_CQE_F_MORE); +			filled = io_fill_cqe_aux(ctx, req->cqe.user_data, +						 mask, IORING_CQE_F_MORE);  			io_commit_cqring(ctx);  			spin_unlock(&ctx->completion_lock); -			if (unlikely(!filled)) -				return -ECANCELED; -			io_cqring_ev_posted(ctx); -			io_add_napi(req->file, ctx); -		} else if (req->result) { -			return 0; +			if (filled) { +				io_cqring_ev_posted(ctx); +				continue; +			} +			return -ECANCELED;  		} +		io_tw_lock(req->ctx, locked); +		if (unlikely(req->task->flags & PF_EXITING)) +			return -EFAULT; +		ret = io_issue_sqe(req, +				   IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); +		if (ret) +			return ret; +  		/*  		 * Release all references, retry if someone tried to restart  		 * task_work while we were executing it. @@ -6029,21 +6957,21 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)  	struct io_ring_ctx *ctx = req->ctx;  	int ret; -	ret = io_poll_check_events(req); +	ret = io_poll_check_events(req, locked);  	if (ret > 0)  		return;  	if (!ret) { -		req->result = mangle_poll(req->result & req->poll.events); +		req->cqe.res = mangle_poll(req->cqe.res & req->poll.events);  	} else { -		req->result = ret; +		req->cqe.res = ret;  		req_set_fail(req);  	}  	io_poll_remove_entries(req);  	spin_lock(&ctx->completion_lock);  	hash_del(&req->hash_node); -	__io_req_complete_post(req, req->result, 0); +	__io_req_complete_post(req, req->cqe.res, 0);  	io_commit_cqring(ctx);  	spin_unlock(&ctx->completion_lock);  	io_cqring_ev_posted(ctx); @@ -6054,7 +6982,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)  	struct io_ring_ctx *ctx = req->ctx;  	int ret; -	ret = io_poll_check_events(req); +	ret = io_poll_check_events(req, locked);  	if (ret > 0)  		return; @@ -6069,26 +6997,27 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)  		io_req_complete_failed(req, ret);  } -static void __io_poll_execute(struct io_kiocb *req, int mask, int events) +static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events)  { -	req->result = mask; +	req->cqe.res = mask;  	/*  	 * This is useful for poll that is armed on behalf of another  	 * request, and where the wakeup path could be on a different  	 * CPU. We want to avoid pulling in req->apoll->events for that  	 * case.  	 */ -	req->cflags = events; +	req->apoll_events = events;  	if (req->opcode == IORING_OP_POLL_ADD)  		req->io_task_work.func = io_poll_task_func;  	else  		req->io_task_work.func = io_apoll_task_func; -	trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask); -	io_req_task_work_add(req, false); +	trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); +	io_req_task_work_add(req);  } -static inline void io_poll_execute(struct io_kiocb *req, int res, int events) +static inline void io_poll_execute(struct io_kiocb *req, int res, +		__poll_t events)  {  	if (io_poll_get_ownership(req))  		__io_poll_execute(req, res, events); @@ -6103,6 +7032,7 @@ static void io_poll_cancel_req(struct io_kiocb *req)  #define wqe_to_req(wait)	((void *)((unsigned long) (wait)->private & ~1))  #define wqe_is_double(wait)	((unsigned long) (wait)->private & 1) +#define IO_ASYNC_POLL_COMMON	(EPOLLONESHOT | EPOLLPRI)  static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,  			void *key) @@ -6137,7 +7067,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,  	}  	/* for instances that support it check for an event match first */ -	if (mask && !(mask & poll->events)) +	if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON)))  		return 0;  	if (io_poll_get_ownership(req)) { @@ -6223,6 +7153,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,  	int v;  	INIT_HLIST_NODE(&req->hash_node); +	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);  	io_init_poll_iocb(poll, mask, io_poll_wake);  	poll->file = req->file; @@ -6261,7 +7192,6 @@ static int __io_arm_poll_handler(struct io_kiocb *req,  		__io_poll_execute(req, mask, poll->events);  		return 0;  	} -	io_add_napi(req->file, req->ctx);  	/*  	 * Release ownership. If someone tried to queue a tw while it was @@ -6294,28 +7224,34 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)  	struct io_ring_ctx *ctx = req->ctx;  	struct async_poll *apoll;  	struct io_poll_table ipt; -	__poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI; +	__poll_t mask = POLLPRI | POLLERR;  	int ret;  	if (!def->pollin && !def->pollout)  		return IO_APOLL_ABORTED; -	if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED)) +	if (!file_can_poll(req->file))  		return IO_APOLL_ABORTED; +	if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) +		return IO_APOLL_ABORTED; +	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) +		mask |= EPOLLONESHOT;  	if (def->pollin) { -		mask |= POLLIN | POLLRDNORM; +		mask |= EPOLLIN | EPOLLRDNORM;  		/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */  		if ((req->opcode == IORING_OP_RECVMSG) &&  		    (req->sr_msg.msg_flags & MSG_ERRQUEUE)) -			mask &= ~POLLIN; +			mask &= ~EPOLLIN;  	} else { -		mask |= POLLOUT | POLLWRNORM; +		mask |= EPOLLOUT | EPOLLWRNORM;  	}  	if (def->poll_exclusive)  		mask |= EPOLLEXCLUSIVE; -	if (!(issue_flags & IO_URING_F_UNLOCKED) && -	    !list_empty(&ctx->apoll_cache)) { +	if (req->flags & REQ_F_POLLED) { +		apoll = req->apoll; +	} else if (!(issue_flags & IO_URING_F_UNLOCKED) && +		   !list_empty(&ctx->apoll_cache)) {  		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,  						poll.wait.entry);  		list_del_init(&apoll->poll.wait.entry); @@ -6335,7 +7271,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)  	if (ret || ipt.error)  		return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; -	trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode, +	trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode,  				mask, apoll->poll.events);  	return IO_APOLL_OK;  } @@ -6368,24 +7304,53 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,  	return found;  } -static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, -				     bool poll_only) +static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, +				     struct io_cancel_data *cd)  	__must_hold(&ctx->completion_lock)  {  	struct hlist_head *list;  	struct io_kiocb *req; -	list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; +	list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)];  	hlist_for_each_entry(req, list, hash_node) { -		if (sqe_addr != req->user_data) +		if (cd->data != req->cqe.user_data)  			continue;  		if (poll_only && req->opcode != IORING_OP_POLL_ADD)  			continue; +		if (cd->flags & IORING_ASYNC_CANCEL_ALL) { +			if (cd->seq == req->work.cancel_seq) +				continue; +			req->work.cancel_seq = cd->seq; +		}  		return req;  	}  	return NULL;  } +static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, +					  struct io_cancel_data *cd) +	__must_hold(&ctx->completion_lock) +{ +	struct io_kiocb *req; +	int i; + +	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { +		struct hlist_head *list; + +		list = &ctx->cancel_hash[i]; +		hlist_for_each_entry(req, list, hash_node) { +			if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && +			    req->file != cd->file) +				continue; +			if (cd->seq == req->work.cancel_seq) +				continue; +			req->work.cancel_seq = cd->seq; +			return req; +		} +	} +	return NULL; +} +  static bool io_poll_disarm(struct io_kiocb *req)  	__must_hold(&ctx->completion_lock)  { @@ -6396,12 +7361,15 @@ static bool io_poll_disarm(struct io_kiocb *req)  	return true;  } -static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, -			  bool poll_only) +static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)  	__must_hold(&ctx->completion_lock)  { -	struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only); +	struct io_kiocb *req; +	if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) +		req = io_poll_file_find(ctx, cd); +	else +		req = io_poll_find(ctx, false, cd);  	if (!req)  		return -ENOENT;  	io_poll_cancel_req(req); @@ -6428,9 +7396,7 @@ static int io_poll_update_prep(struct io_kiocb *req,  	struct io_poll_update *upd = &req->poll_update;  	u32 flags; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) +	if (sqe->buf_index || sqe->splice_fd_in)  		return -EINVAL;  	flags = READ_ONCE(sqe->len);  	if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | @@ -6460,9 +7426,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe  	struct io_poll_iocb *poll = &req->poll;  	u32 flags; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr) +	if (sqe->buf_index || sqe->off || sqe->addr)  		return -EINVAL;  	flags = READ_ONCE(sqe->len);  	if (flags & ~IORING_POLL_ADD_MULTI) @@ -6471,7 +7435,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe  		return -EINVAL;  	io_req_set_refcount(req); -	req->cflags = poll->events = io_poll_parse_events(sqe, flags); +	req->apoll_events = poll->events = io_poll_parse_events(sqe, flags);  	return 0;  } @@ -6492,13 +7456,14 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)  static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)  { +	struct io_cancel_data cd = { .data = req->poll_update.old_user_data, };  	struct io_ring_ctx *ctx = req->ctx;  	struct io_kiocb *preq;  	int ret2, ret = 0;  	bool locked;  	spin_lock(&ctx->completion_lock); -	preq = io_poll_find(ctx, req->poll_update.old_user_data, true); +	preq = io_poll_find(ctx, true, &cd);  	if (!preq || !io_poll_disarm(preq)) {  		spin_unlock(&ctx->completion_lock);  		ret = preq ? -EALREADY : -ENOENT; @@ -6514,7 +7479,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)  			preq->poll.events |= IO_POLL_UNMASK;  		}  		if (req->poll_update.update_user_data) -			preq->user_data = req->poll_update.new_user_data; +			preq->cqe.user_data = req->poll_update.new_user_data;  		ret2 = io_poll_add(preq, issue_flags);  		/* successfully updated, don't complete poll request */ @@ -6523,7 +7488,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)  	}  	req_set_fail(preq); -	preq->result = -ECANCELED; +	preq->cqe.res = -ECANCELED;  	locked = !(issue_flags & IO_URING_F_UNLOCKED);  	io_req_task_complete(preq, &locked);  out: @@ -6551,14 +7516,14 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)  	if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))  		req_set_fail(req); -	req->result = -ETIME; +	req->cqe.res = -ETIME;  	req->io_task_work.func = io_req_task_complete; -	io_req_task_work_add(req, false); +	io_req_task_work_add(req);  	return HRTIMER_NORESTART;  }  static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, -					   __u64 user_data) +					   struct io_cancel_data *cd)  	__must_hold(&ctx->timeout_lock)  {  	struct io_timeout_data *io; @@ -6566,9 +7531,16 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,  	bool found = false;  	list_for_each_entry(req, &ctx->timeout_list, timeout.list) { -		found = user_data == req->user_data; -		if (found) -			break; +		if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && +		    cd->data != req->cqe.user_data) +			continue; +		if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { +			if (cd->seq == req->work.cancel_seq) +				continue; +			req->work.cancel_seq = cd->seq; +		} +		found = true; +		break;  	}  	if (!found)  		return ERR_PTR(-ENOENT); @@ -6580,11 +7552,14 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,  	return req;  } -static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) +static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)  	__must_hold(&ctx->completion_lock) -	__must_hold(&ctx->timeout_lock)  { -	struct io_kiocb *req = io_timeout_extract(ctx, user_data); +	struct io_kiocb *req; + +	spin_lock_irq(&ctx->timeout_lock); +	req = io_timeout_extract(ctx, cd); +	spin_unlock_irq(&ctx->timeout_lock);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -6617,7 +7592,7 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,  	bool found = false;  	list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { -		found = user_data == req->user_data; +		found = user_data == req->cqe.user_data;  		if (found)  			break;  	} @@ -6637,7 +7612,8 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,  			     struct timespec64 *ts, enum hrtimer_mode mode)  	__must_hold(&ctx->timeout_lock)  { -	struct io_kiocb *req = io_timeout_extract(ctx, user_data); +	struct io_cancel_data cd = { .data = user_data, }; +	struct io_kiocb *req = io_timeout_extract(ctx, &cd);  	struct io_timeout_data *data;  	if (IS_ERR(req)) @@ -6657,11 +7633,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req,  {  	struct io_timeout_rem *tr = &req->timeout_rem; -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL;  	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))  		return -EINVAL; -	if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) +	if (sqe->buf_index || sqe->len || sqe->splice_fd_in)  		return -EINVAL;  	tr->ltimeout = false; @@ -6702,10 +7676,10 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)  	int ret;  	if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { +		struct io_cancel_data cd = { .data = tr->addr, }; +  		spin_lock(&ctx->completion_lock); -		spin_lock_irq(&ctx->timeout_lock); -		ret = io_timeout_cancel(ctx, tr->addr); -		spin_unlock_irq(&ctx->timeout_lock); +		ret = io_timeout_cancel(ctx, &cd);  		spin_unlock(&ctx->completion_lock);  	} else {  		enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); @@ -6731,10 +7705,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,  	unsigned flags;  	u32 off = READ_ONCE(sqe->off); -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || -	    sqe->splice_fd_in) +	if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in)  		return -EINVAL;  	if (off && is_timeout_link)  		return -EINVAL; @@ -6766,6 +7737,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,  	if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)  		return -EINVAL; +	INIT_LIST_HEAD(&req->timeout.list);  	data->mode = io_translate_timeout_mode(flags);  	hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); @@ -6832,30 +7804,42 @@ add:  	return 0;  } -struct io_cancel_data { -	struct io_ring_ctx *ctx; -	u64 user_data; -}; -  static bool io_cancel_cb(struct io_wq_work *work, void *data)  {  	struct io_kiocb *req = container_of(work, struct io_kiocb, work);  	struct io_cancel_data *cd = data; -	return req->ctx == cd->ctx && req->user_data == cd->user_data; +	if (req->ctx != cd->ctx) +		return false; +	if (cd->flags & IORING_ASYNC_CANCEL_ANY) { +		; +	} else if (cd->flags & IORING_ASYNC_CANCEL_FD) { +		if (req->file != cd->file) +			return false; +	} else { +		if (req->cqe.user_data != cd->data) +			return false; +	} +	if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { +		if (cd->seq == req->work.cancel_seq) +			return false; +		req->work.cancel_seq = cd->seq; +	} +	return true;  } -static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, -			       struct io_ring_ctx *ctx) +static int io_async_cancel_one(struct io_uring_task *tctx, +			       struct io_cancel_data *cd)  { -	struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };  	enum io_wq_cancel cancel_ret;  	int ret = 0; +	bool all;  	if (!tctx || !tctx->io_wq)  		return -ENOENT; -	cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false); +	all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); +	cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all);  	switch (cancel_ret) {  	case IO_WQ_CANCEL_OK:  		ret = 0; @@ -6871,14 +7855,14 @@ static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,  	return ret;  } -static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) +static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd)  {  	struct io_ring_ctx *ctx = req->ctx;  	int ret;  	WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); -	ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); +	ret = io_async_cancel_one(req->task->io_uring, cd);  	/*  	 * Fall-through even for -EALREADY, as we may have poll armed  	 * that need unarming. @@ -6887,56 +7871,98 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)  		return 0;  	spin_lock(&ctx->completion_lock); -	ret = io_poll_cancel(ctx, sqe_addr, false); +	ret = io_poll_cancel(ctx, cd);  	if (ret != -ENOENT)  		goto out; - -	spin_lock_irq(&ctx->timeout_lock); -	ret = io_timeout_cancel(ctx, sqe_addr); -	spin_unlock_irq(&ctx->timeout_lock); +	if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) +		ret = io_timeout_cancel(ctx, cd);  out:  	spin_unlock(&ctx->completion_lock);  	return ret;  } +#define CANCEL_FLAGS	(IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ +			 IORING_ASYNC_CANCEL_ANY) +  static int io_async_cancel_prep(struct io_kiocb *req,  				const struct io_uring_sqe *sqe)  { -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) -		return -EINVAL; -	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) +	if (unlikely(req->flags & REQ_F_BUFFER_SELECT))  		return -EINVAL; -	if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || -	    sqe->splice_fd_in) +	if (sqe->off || sqe->len || sqe->splice_fd_in)  		return -EINVAL;  	req->cancel.addr = READ_ONCE(sqe->addr); +	req->cancel.flags = READ_ONCE(sqe->cancel_flags); +	if (req->cancel.flags & ~CANCEL_FLAGS) +		return -EINVAL; +	if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) { +		if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY) +			return -EINVAL; +		req->cancel.fd = READ_ONCE(sqe->fd); +	} +  	return 0;  } -static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) +static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, +			     unsigned int issue_flags)  { -	struct io_ring_ctx *ctx = req->ctx; -	u64 sqe_addr = req->cancel.addr; -	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; +	bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); +	struct io_ring_ctx *ctx = cd->ctx;  	struct io_tctx_node *node; -	int ret; +	int ret, nr = 0; -	ret = io_try_cancel_userdata(req, sqe_addr); -	if (ret != -ENOENT) -		goto done; +	do { +		ret = io_try_cancel(req, cd); +		if (ret == -ENOENT) +			break; +		if (!all) +			return ret; +		nr++; +	} while (1);  	/* slow path, try all io-wq's */ -	io_ring_submit_lock(ctx, needs_lock); +	io_ring_submit_lock(ctx, issue_flags);  	ret = -ENOENT;  	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {  		struct io_uring_task *tctx = node->task->io_uring; -		ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); -		if (ret != -ENOENT) -			break; +		ret = io_async_cancel_one(tctx, cd); +		if (ret != -ENOENT) { +			if (!all) +				break; +			nr++; +		}  	} -	io_ring_submit_unlock(ctx, needs_lock); +	io_ring_submit_unlock(ctx, issue_flags); +	return all ? nr : ret; +} + +static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) +{ +	struct io_cancel_data cd = { +		.ctx	= req->ctx, +		.data	= req->cancel.addr, +		.flags	= req->cancel.flags, +		.seq	= atomic_inc_return(&req->ctx->cancel_seq), +	}; +	int ret; + +	if (cd.flags & IORING_ASYNC_CANCEL_FD) { +		if (req->flags & REQ_F_FIXED_FILE) +			req->file = io_file_get_fixed(req, req->cancel.fd, +							issue_flags); +		else +			req->file = io_file_get_normal(req, req->cancel.fd); +		if (!req->file) { +			ret = -EBADF; +			goto done; +		} +		cd.file = req->file; +	} + +	ret = __io_async_cancel(&cd, req, issue_flags);  done:  	if (ret < 0)  		req_set_fail(req); @@ -6949,7 +7975,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,  {  	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))  		return -EINVAL; -	if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) +	if (sqe->rw_flags || sqe->splice_fd_in)  		return -EINVAL;  	req->rsrc_update.offset = READ_ONCE(sqe->off); @@ -6963,7 +7989,6 @@ static int io_rsrc_update_prep(struct io_kiocb *req,  static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_ring_ctx *ctx = req->ctx; -	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;  	struct io_uring_rsrc_update2 up;  	int ret; @@ -6972,11 +7997,12 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)  	up.nr = 0;  	up.tags = 0;  	up.resv = 0; +	up.resv2 = 0; -	io_ring_submit_lock(ctx, needs_lock); +	io_ring_submit_lock(ctx, issue_flags);  	ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,  					&up, req->rsrc_update.nr_args); -	io_ring_submit_unlock(ctx, needs_lock); +	io_ring_submit_unlock(ctx, issue_flags);  	if (ret < 0)  		req_set_fail(req); @@ -6988,15 +8014,14 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  {  	switch (req->opcode) {  	case IORING_OP_NOP: -		return 0; +		return io_nop_prep(req, sqe);  	case IORING_OP_READV:  	case IORING_OP_READ_FIXED:  	case IORING_OP_READ: -		return io_read_prep(req, sqe);  	case IORING_OP_WRITEV:  	case IORING_OP_WRITE_FIXED:  	case IORING_OP_WRITE: -		return io_write_prep(req, sqe); +		return io_prep_rw(req, sqe);  	case IORING_OP_POLL_ADD:  		return io_poll_add_prep(req, sqe);  	case IORING_OP_POLL_REMOVE: @@ -7063,6 +8088,18 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  		return io_linkat_prep(req, sqe);  	case IORING_OP_MSG_RING:  		return io_msg_ring_prep(req, sqe); +	case IORING_OP_FSETXATTR: +		return io_fsetxattr_prep(req, sqe); +	case IORING_OP_SETXATTR: +		return io_setxattr_prep(req, sqe); +	case IORING_OP_FGETXATTR: +		return io_fgetxattr_prep(req, sqe); +	case IORING_OP_GETXATTR: +		return io_getxattr_prep(req, sqe); +	case IORING_OP_SOCKET: +		return io_socket_prep(req, sqe); +	case IORING_OP_URING_CMD: +		return io_uring_cmd_prep(req, sqe);  	}  	printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -7072,7 +8109,12 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  static int io_req_prep_async(struct io_kiocb *req)  { -	if (!io_op_defs[req->opcode].needs_async_setup) +	const struct io_op_def *def = &io_op_defs[req->opcode]; + +	/* assign early for deferred execution for non-fixed file */ +	if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) +		req->file = io_file_get_normal(req, req->cqe.fd); +	if (!def->needs_async_setup)  		return 0;  	if (WARN_ON_ONCE(req_has_async_data(req)))  		return -EFAULT; @@ -7090,6 +8132,8 @@ static int io_req_prep_async(struct io_kiocb *req)  		return io_recvmsg_prep_async(req);  	case IORING_OP_CONNECT:  		return io_connect_prep_async(req); +	case IORING_OP_URING_CMD: +		return io_uring_cmd_prep_async(req);  	}  	printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",  		    req->opcode); @@ -7099,9 +8143,10 @@ static int io_req_prep_async(struct io_kiocb *req)  static u32 io_get_sequence(struct io_kiocb *req)  {  	u32 seq = req->ctx->cached_sq_head; +	struct io_kiocb *cur;  	/* need original cached_sq_head, but it was increased for each req */ -	io_for_each_link(req, req) +	io_for_each_link(cur, req)  		seq--;  	return seq;  } @@ -7144,7 +8189,7 @@ fail:  		goto queue;  	} -	trace_io_uring_defer(ctx, req, req->user_data, req->opcode); +	trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode);  	de->req = req;  	de->seq = seq;  	list_add_tail(&de->list, &ctx->defer_list); @@ -7179,11 +8224,6 @@ static void io_clean_op(struct io_kiocb *req)  			kfree(io->free_iov);  			break;  			} -		case IORING_OP_SPLICE: -		case IORING_OP_TEE: -			if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED)) -				io_put_file(req->splice.file_in); -			break;  		case IORING_OP_OPENAT:  		case IORING_OP_OPENAT2:  			if (req->open.filename) @@ -7211,6 +8251,12 @@ static void io_clean_op(struct io_kiocb *req)  			if (req->statx.filename)  				putname(req->statx.filename);  			break; +		case IORING_OP_SETXATTR: +		case IORING_OP_FSETXATTR: +		case IORING_OP_GETXATTR: +		case IORING_OP_FGETXATTR: +			__io_xattr_finish(req); +			break;  		}  	}  	if ((req->flags & REQ_F_POLLED) && req->apoll) { @@ -7218,11 +8264,6 @@ static void io_clean_op(struct io_kiocb *req)  		kfree(req->apoll);  		req->apoll = NULL;  	} -	if (req->flags & REQ_F_INFLIGHT) { -		struct io_uring_task *tctx = req->task->io_uring; - -		atomic_dec(&tctx->inflight_tracked); -	}  	if (req->flags & REQ_F_CREDS)  		put_cred(req->creds);  	if (req->flags & REQ_F_ASYNC_DATA) { @@ -7232,11 +8273,27 @@ static void io_clean_op(struct io_kiocb *req)  	req->flags &= ~IO_REQ_CLEAN_FLAGS;  } +static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) +{ +	if (req->file || !io_op_defs[req->opcode].needs_file) +		return true; + +	if (req->flags & REQ_F_FIXED_FILE) +		req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags); +	else +		req->file = io_file_get_normal(req, req->cqe.fd); + +	return !!req->file; +} +  static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)  {  	const struct cred *creds = NULL;  	int ret; +	if (unlikely(!io_assign_file(req, issue_flags))) +		return -EBADF; +  	if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))  		creds = override_creds(req->creds); @@ -7356,6 +8413,24 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)  	case IORING_OP_MSG_RING:  		ret = io_msg_ring(req, issue_flags);  		break; +	case IORING_OP_FSETXATTR: +		ret = io_fsetxattr(req, issue_flags); +		break; +	case IORING_OP_SETXATTR: +		ret = io_setxattr(req, issue_flags); +		break; +	case IORING_OP_FGETXATTR: +		ret = io_fgetxattr(req, issue_flags); +		break; +	case IORING_OP_GETXATTR: +		ret = io_getxattr(req, issue_flags); +		break; +	case IORING_OP_SOCKET: +		ret = io_socket(req, issue_flags); +		break; +	case IORING_OP_URING_CMD: +		ret = io_uring_cmd(req, issue_flags); +		break;  	default:  		ret = -EINVAL;  		break; @@ -7386,10 +8461,10 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)  static void io_wq_submit_work(struct io_wq_work *work)  {  	struct io_kiocb *req = container_of(work, struct io_kiocb, work); +	const struct io_op_def *def = &io_op_defs[req->opcode];  	unsigned int issue_flags = IO_URING_F_UNLOCKED;  	bool needs_poll = false; -	struct io_kiocb *timeout; -	int ret = 0; +	int ret = 0, err = -ECANCELED;  	/* one will be dropped by ->io_free_work() after returning to io-wq */  	if (!(req->flags & REQ_F_REFCOUNT)) @@ -7397,18 +8472,21 @@ static void io_wq_submit_work(struct io_wq_work *work)  	else  		req_ref_get(req); -	timeout = io_prep_linked_timeout(req); -	if (timeout) -		io_queue_linked_timeout(timeout); +	io_arm_ltimeout(req);  	/* either cancelled or io-wq is dying, so don't touch tctx->iowq */  	if (work->flags & IO_WQ_WORK_CANCEL) { -		io_req_task_queue_fail(req, -ECANCELED); +fail: +		io_req_task_queue_fail(req, err);  		return;  	} +	if (!io_assign_file(req, issue_flags)) { +		err = -EBADF; +		work->flags |= IO_WQ_WORK_CANCEL; +		goto fail; +	}  	if (req->flags & REQ_F_FORCE_ASYNC) { -		const struct io_op_def *def = &io_op_defs[req->opcode];  		bool opcode_poll = def->pollin || def->pollout;  		if (opcode_poll && file_can_poll(req->file)) { @@ -7427,6 +8505,8 @@ static void io_wq_submit_work(struct io_wq_work *work)  		 * wait for request slots on the block side.  		 */  		if (!needs_poll) { +			if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) +				break;  			cond_resched();  			continue;  		} @@ -7465,54 +8545,69 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file  	file_slot->file_ptr = file_ptr;  } -static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, -					     struct io_kiocb *req, int fd) +static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, +					     unsigned int issue_flags)  { -	struct file *file; +	struct io_ring_ctx *ctx = req->ctx; +	struct file *file = NULL;  	unsigned long file_ptr; +	io_ring_submit_lock(ctx, issue_flags); +  	if (unlikely((unsigned int)fd >= ctx->nr_user_files)) -		return NULL; +		goto out;  	fd = array_index_nospec(fd, ctx->nr_user_files);  	file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;  	file = (struct file *) (file_ptr & FFS_MASK);  	file_ptr &= ~FFS_MASK;  	/* mask in overlapping REQ_F and FFS bits */  	req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); -	io_req_set_rsrc_node(req, ctx); +	io_req_set_rsrc_node(req, ctx, 0); +	WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap)); +out: +	io_ring_submit_unlock(ctx, issue_flags);  	return file;  } -static struct file *io_file_get_normal(struct io_ring_ctx *ctx, -				       struct io_kiocb *req, int fd) +/* + * Drop the file for requeue operations. Only used of req->file is the + * io_uring descriptor itself. + */ +static void io_drop_inflight_file(struct io_kiocb *req) +{ +	if (unlikely(req->flags & REQ_F_INFLIGHT)) { +		fput(req->file); +		req->file = NULL; +		req->flags &= ~REQ_F_INFLIGHT; +	} +} + +static struct file *io_file_get_normal(struct io_kiocb *req, int fd)  {  	struct file *file = fget(fd); -	trace_io_uring_file_get(ctx, req, req->user_data, fd); +	trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd);  	/* we don't allow fixed io_uring files */ -	if (file && unlikely(file->f_op == &io_uring_fops)) -		io_req_track_inflight(req); +	if (file && file->f_op == &io_uring_fops) +		req->flags |= REQ_F_INFLIGHT;  	return file;  } -static inline struct file *io_file_get(struct io_ring_ctx *ctx, -				       struct io_kiocb *req, int fd, bool fixed) -{ -	if (fixed) -		return io_file_get_fixed(ctx, req, fd); -	else -		return io_file_get_normal(ctx, req, fd); -} -  static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)  {  	struct io_kiocb *prev = req->timeout.prev;  	int ret = -ENOENT;  	if (prev) { -		if (!(req->task->flags & PF_EXITING)) -			ret = io_try_cancel_userdata(req, prev->user_data); +		if (!(req->task->flags & PF_EXITING)) { +			struct io_cancel_data cd = { +				.ctx		= req->ctx, +				.data		= prev->cqe.user_data, +			}; + +			ret = io_try_cancel(req, &cd); +		}  		io_req_complete_post(req, ret ?: -ETIME, 0);  		io_put_req(prev);  	} else { @@ -7546,7 +8641,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)  	spin_unlock_irqrestore(&ctx->timeout_lock, flags);  	req->io_task_work.func = io_req_task_link_timeout; -	io_req_task_work_add(req, false); +	io_req_task_work_add(req);  	return HRTIMER_NORESTART;  } @@ -7572,10 +8667,17 @@ static void io_queue_linked_timeout(struct io_kiocb *req)  	io_put_req(req);  } -static void io_queue_sqe_arm_apoll(struct io_kiocb *req) +static void io_queue_async(struct io_kiocb *req, int ret)  	__must_hold(&req->ctx->uring_lock)  { -	struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); +	struct io_kiocb *linked_timeout; + +	if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { +		io_req_complete_failed(req, ret); +		return; +	} + +	linked_timeout = io_prep_linked_timeout(req);  	switch (io_arm_poll_handler(req, 0)) {  	case IO_APOLL_READY: @@ -7586,7 +8688,7 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)  		 * Queued up for async execution, worker will release  		 * submit reference when the iocb is actually submitted.  		 */ -		io_queue_async_work(req, NULL); +		io_queue_iowq(req, NULL);  		break;  	case IO_APOLL_OK:  		break; @@ -7596,10 +8698,9 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)  		io_queue_linked_timeout(linked_timeout);  } -static inline void __io_queue_sqe(struct io_kiocb *req) +static inline void io_queue_sqe(struct io_kiocb *req)  	__must_hold(&req->ctx->uring_lock)  { -	struct io_kiocb *linked_timeout;  	int ret;  	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); @@ -7612,22 +8713,23 @@ static inline void __io_queue_sqe(struct io_kiocb *req)  	 * We async punt it if the file wasn't marked NOWAIT, or if the file  	 * doesn't support non-blocking read/write attempts  	 */ -	if (likely(!ret)) { -		linked_timeout = io_prep_linked_timeout(req); -		if (linked_timeout) -			io_queue_linked_timeout(linked_timeout); -	} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { -		io_queue_sqe_arm_apoll(req); -	} else { -		io_req_complete_failed(req, ret); -	} +	if (likely(!ret)) +		io_arm_ltimeout(req); +	else +		io_queue_async(req, ret);  }  static void io_queue_sqe_fallback(struct io_kiocb *req)  	__must_hold(&req->ctx->uring_lock)  { -	if (req->flags & REQ_F_FAIL) { -		io_req_complete_fail_submit(req); +	if (unlikely(req->flags & REQ_F_FAIL)) { +		/* +		 * We don't submit, fail them all, for that replace hardlinks +		 * with normal links. Extra REQ_F_LINK is tolerated. +		 */ +		req->flags &= ~REQ_F_HARDLINK; +		req->flags |= REQ_F_LINK; +		io_req_complete_failed(req, req->cqe.res);  	} else if (unlikely(req->ctx->drain_active)) {  		io_drain_req(req);  	} else { @@ -7636,19 +8738,10 @@ static void io_queue_sqe_fallback(struct io_kiocb *req)  		if (unlikely(ret))  			io_req_complete_failed(req, ret);  		else -			io_queue_async_work(req, NULL); +			io_queue_iowq(req, NULL);  	}  } -static inline void io_queue_sqe(struct io_kiocb *req) -	__must_hold(&req->ctx->uring_lock) -{ -	if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) -		__io_queue_sqe(req); -	else -		io_queue_sqe_fallback(req); -} -  /*   * Check SQE restrictions (opcode and flags).   * @@ -7703,9 +8796,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,  	req->opcode = opcode = READ_ONCE(sqe->opcode);  	/* same numerical values with corresponding REQ_F_*, safe to copy */  	req->flags = sqe_flags = READ_ONCE(sqe->flags); -	req->user_data = READ_ONCE(sqe->user_data); +	req->cqe.user_data = READ_ONCE(sqe->user_data);  	req->file = NULL; -	req->fixed_rsrc_refs = NULL; +	req->rsrc_node = NULL;  	req->task = current;  	if (unlikely(opcode >= IORING_OP_LAST)) { @@ -7716,9 +8809,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,  		/* enforce forwards compatibility on users */  		if (sqe_flags & ~SQE_VALID_FLAGS)  			return -EINVAL; -		if ((sqe_flags & IOSQE_BUFFER_SELECT) && -		    !io_op_defs[opcode].buffer_select) -			return -EOPNOTSUPP; +		if (sqe_flags & IOSQE_BUFFER_SELECT) { +			if (!io_op_defs[opcode].buffer_select) +				return -EOPNOTSUPP; +			req->buf_index = READ_ONCE(sqe->buf_group); +		}  		if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)  			ctx->drain_disabled = true;  		if (sqe_flags & IOSQE_IO_DRAIN) { @@ -7741,9 +8836,16 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,  		}  	} +	if (!io_op_defs[opcode].ioprio && sqe->ioprio) +		return -EINVAL; +	if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) +		return -EINVAL; +  	if (io_op_defs[opcode].needs_file) {  		struct io_submit_state *state = &ctx->submit_state; +		req->cqe.fd = READ_ONCE(sqe->fd); +  		/*  		 * Plug now if we have more than 2 IO left after this, and the  		 * target is potentially a read/write to block based storage. @@ -7753,11 +8855,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,  			state->need_plug = false;  			blk_start_plug_nr_ios(&state->plug, state->submit_nr);  		} - -		req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), -					(sqe_flags & IOSQE_FIXED_FILE)); -		if (unlikely(!req->file)) -			return -EBADF;  	}  	personality = READ_ONCE(sqe->personality); @@ -7779,7 +8876,44 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,  	return io_req_prep(req, sqe);  } -static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, +static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, +				      struct io_kiocb *req, int ret) +{ +	struct io_ring_ctx *ctx = req->ctx; +	struct io_submit_link *link = &ctx->submit_state.link; +	struct io_kiocb *head = link->head; + +	trace_io_uring_req_failed(sqe, ctx, req, ret); + +	/* +	 * Avoid breaking links in the middle as it renders links with SQPOLL +	 * unusable. Instead of failing eagerly, continue assembling the link if +	 * applicable and mark the head with REQ_F_FAIL. The link flushing code +	 * should find the flag and handle the rest. +	 */ +	req_fail_link_node(req, ret); +	if (head && !(head->flags & REQ_F_FAIL)) +		req_fail_link_node(head, -ECANCELED); + +	if (!(req->flags & IO_REQ_LINK_FLAGS)) { +		if (head) { +			link->last->link = req; +			link->head = NULL; +			req = head; +		} +		io_queue_sqe_fallback(req); +		return ret; +	} + +	if (head) +		link->last->link = req; +	else +		link->head = req; +	link->last = req; +	return 0; +} + +static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,  			 const struct io_uring_sqe *sqe)  	__must_hold(&ctx->uring_lock)  { @@ -7787,35 +8921,11 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,  	int ret;  	ret = io_init_req(ctx, req, sqe); -	if (unlikely(ret)) { -		trace_io_uring_req_failed(sqe, ctx, req, ret); - -		/* fail even hard links since we don't submit */ -		if (link->head) { -			/* -			 * we can judge a link req is failed or cancelled by if -			 * REQ_F_FAIL is set, but the head is an exception since -			 * it may be set REQ_F_FAIL because of other req's failure -			 * so let's leverage req->result to distinguish if a head -			 * is set REQ_F_FAIL because of its failure or other req's -			 * failure so that we can set the correct ret code for it. -			 * init result here to avoid affecting the normal path. -			 */ -			if (!(link->head->flags & REQ_F_FAIL)) -				req_fail_link_node(link->head, -ECANCELED); -		} else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { -			/* -			 * the current req is a normal req, we should return -			 * error and thus break the submittion loop. -			 */ -			io_req_complete_failed(req, ret); -			return ret; -		} -		req_fail_link_node(req, ret); -	} +	if (unlikely(ret)) +		return io_submit_fail_init(sqe, req, ret);  	/* don't need @sqe from now on */ -	trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode, +	trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode,  				  req->flags, true,  				  ctx->flags & IORING_SETUP_SQPOLL); @@ -7826,29 +8936,32 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,  	 * submitted sync once the chain is complete. If none of those  	 * conditions are true (normal request), then just queue it.  	 */ -	if (link->head) { -		struct io_kiocb *head = link->head; +	if (unlikely(link->head)) { +		ret = io_req_prep_async(req); +		if (unlikely(ret)) +			return io_submit_fail_init(sqe, req, ret); -		if (!(req->flags & REQ_F_FAIL)) { -			ret = io_req_prep_async(req); -			if (unlikely(ret)) { -				req_fail_link_node(req, ret); -				if (!(head->flags & REQ_F_FAIL)) -					req_fail_link_node(head, -ECANCELED); -			} -		} -		trace_io_uring_link(ctx, req, head); +		trace_io_uring_link(ctx, req, link->head);  		link->last->link = req;  		link->last = req; -		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) +		if (req->flags & IO_REQ_LINK_FLAGS)  			return 0; -		/* last request of a link, enqueue the link */ +		/* last request of the link, flush it */ +		req = link->head;  		link->head = NULL; -		req = head; -	} else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { -		link->head = req; -		link->last = req; +		if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)) +			goto fallback; + +	} else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS | +					  REQ_F_FORCE_ASYNC | REQ_F_FAIL))) { +		if (req->flags & IO_REQ_LINK_FLAGS) { +			link->head = req; +			link->last = req; +		} else { +fallback: +			io_queue_sqe_fallback(req); +		}  		return 0;  	} @@ -7863,8 +8976,8 @@ static void io_submit_state_end(struct io_ring_ctx *ctx)  {  	struct io_submit_state *state = &ctx->submit_state; -	if (state->link.head) -		io_queue_sqe(state->link.head); +	if (unlikely(state->link.head)) +		io_queue_sqe_fallback(state->link.head);  	/* flush only after queuing links as they can generate completions */  	io_submit_flush_completions(ctx);  	if (state->plug_started) @@ -7918,8 +9031,12 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)  	 *    though the application is the one updating it.  	 */  	head = READ_ONCE(ctx->sq_array[sq_idx]); -	if (likely(head < ctx->sq_entries)) +	if (likely(head < ctx->sq_entries)) { +		/* double index for 128-byte SQEs, twice as long */ +		if (ctx->flags & IORING_SETUP_SQE128) +			head <<= 1;  		return &ctx->sq_sqes[head]; +	}  	/* drop invalid entries */  	ctx->cq_extra--; @@ -7932,54 +9049,52 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)  	__must_hold(&ctx->uring_lock)  {  	unsigned int entries = io_sqring_entries(ctx); -	int submitted = 0; +	unsigned int left; +	int ret;  	if (unlikely(!entries))  		return 0;  	/* make sure SQ entry isn't read before tail */ -	nr = min3(nr, ctx->sq_entries, entries); -	io_get_task_refs(nr); +	ret = left = min3(nr, ctx->sq_entries, entries); +	io_get_task_refs(left); +	io_submit_state_start(&ctx->submit_state, left); -	io_submit_state_start(&ctx->submit_state, nr);  	do {  		const struct io_uring_sqe *sqe;  		struct io_kiocb *req; -		if (unlikely(!io_alloc_req_refill(ctx))) { -			if (!submitted) -				submitted = -EAGAIN; +		if (unlikely(!io_alloc_req_refill(ctx)))  			break; -		}  		req = io_alloc_req(ctx);  		sqe = io_get_sqe(ctx);  		if (unlikely(!sqe)) { -			wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); +			io_req_add_to_cache(req, ctx);  			break;  		} -		/* will complete beyond this point, count as submitted */ -		submitted++; -		if (io_submit_sqe(ctx, req, sqe)) { -			/* -			 * Continue submitting even for sqe failure if the -			 * ring was setup with IORING_SETUP_SUBMIT_ALL -			 */ -			if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL)) -				break; -		} -	} while (submitted < nr); -	if (unlikely(submitted != nr)) { -		int ref_used = (submitted == -EAGAIN) ? 0 : submitted; -		int unused = nr - ref_used; +		/* +		 * Continue submitting even for sqe failure if the +		 * ring was setup with IORING_SETUP_SUBMIT_ALL +		 */ +		if (unlikely(io_submit_sqe(ctx, req, sqe)) && +		    !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { +			left--; +			break; +		} +	} while (--left); -		current->io_uring->cached_refs += unused; +	if (unlikely(left)) { +		ret -= left; +		/* try again if it submitted nothing and can't allocate a req */ +		if (!ret && io_req_cache_empty(ctx)) +			ret = -EAGAIN; +		current->io_uring->cached_refs += left;  	}  	io_submit_state_end(ctx);  	 /* Commit SQ ring head once we've consumed and submitted all SQEs */  	io_commit_sqring(ctx); - -	return submitted; +	return ret;  }  static inline bool io_sqd_events_pending(struct io_sq_data *sqd) @@ -7987,23 +9102,6 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)  	return READ_ONCE(sqd->state);  } -static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) -{ -	/* Tell userspace we may need a wakeup call */ -	spin_lock(&ctx->completion_lock); -	WRITE_ONCE(ctx->rings->sq_flags, -		   ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); -	spin_unlock(&ctx->completion_lock); -} - -static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) -{ -	spin_lock(&ctx->completion_lock); -	WRITE_ONCE(ctx->rings->sq_flags, -		   ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); -	spin_unlock(&ctx->completion_lock); -} -  static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)  {  	unsigned int to_submit; @@ -8032,13 +9130,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)  		    !(ctx->flags & IORING_SETUP_R_DISABLED))  			ret = io_submit_sqes(ctx, to_submit);  		mutex_unlock(&ctx->uring_lock); -#ifdef CONFIG_NET_RX_BUSY_POLL -		spin_lock(&ctx->napi_lock); -		if (!list_empty(&ctx->napi_list) && -		    io_napi_busy_loop(&ctx->napi_list)) -			++ret; -		spin_unlock(&ctx->napi_lock); -#endif +  		if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))  			wake_up(&ctx->sqo_sq_wait);  		if (creds) @@ -8125,8 +9217,8 @@ static int io_sq_thread(void *data)  			bool needs_sched = true;  			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { -				io_ring_set_wakeup_flag(ctx); - +				atomic_or(IORING_SQ_NEED_WAKEUP, +						&ctx->rings->sq_flags);  				if ((ctx->flags & IORING_SETUP_IOPOLL) &&  				    !wq_list_empty(&ctx->iopoll_list)) {  					needs_sched = false; @@ -8137,7 +9229,7 @@ static int io_sq_thread(void *data)  				 * Ensure the store of the wakeup flag is not  				 * reordered with the load of the SQ tail  				 */ -				smp_mb(); +				smp_mb__after_atomic();  				if (io_sqring_entries(ctx)) {  					needs_sched = false; @@ -8151,7 +9243,8 @@ static int io_sq_thread(void *data)  				mutex_lock(&sqd->lock);  			}  			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) -				io_ring_clear_wakeup_flag(ctx); +				atomic_andnot(IORING_SQ_NEED_WAKEUP, +						&ctx->rings->sq_flags);  		}  		finish_wait(&sqd->wait, &wait); @@ -8161,7 +9254,7 @@ static int io_sq_thread(void *data)  	io_uring_cancel_generic(true, sqd);  	sqd->thread = NULL;  	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) -		io_ring_set_wakeup_flag(ctx); +		atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);  	io_run_task_work();  	mutex_unlock(&sqd->lock); @@ -8176,9 +9269,6 @@ struct io_wait_queue {  	struct io_ring_ctx *ctx;  	unsigned cq_tail;  	unsigned nr_timeouts; -#ifdef CONFIG_NET_RX_BUSY_POLL -	unsigned busy_poll_to; -#endif  };  static inline bool io_should_wake(struct io_wait_queue *iowq) @@ -8204,7 +9294,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,  	 * Cannot safely flush overflowed CQEs from here, ensure we wake up  	 * the task, and the next invocation will do it.  	 */ -	if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow)) +	if (io_should_wake(iowq) || +	    test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq))  		return autoremove_wake_function(curr, mode, wake_flags, key);  	return -1;  } @@ -8226,101 +9317,23 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,  					  ktime_t timeout)  {  	int ret; +	unsigned long check_cq;  	/* make sure we run task_work before checking for signals */  	ret = io_run_task_work_sig();  	if (ret || io_should_wake(iowq))  		return ret; +	check_cq = READ_ONCE(ctx->check_cq);  	/* let the caller flush overflows, retry */ -	if (test_bit(0, &ctx->check_cq_overflow)) +	if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))  		return 1; - +	if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) +		return -EBADR;  	if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))  		return -ETIME;  	return 1;  } -#ifdef CONFIG_NET_RX_BUSY_POLL -static void io_adjust_busy_loop_timeout(struct timespec64 *ts, -					struct io_wait_queue *iowq) -{ -	unsigned busy_poll_to = READ_ONCE(sysctl_net_busy_poll); -	struct timespec64 pollto = ns_to_timespec64(1000 * (s64)busy_poll_to); - -	if (timespec64_compare(ts, &pollto) > 0) { -		*ts = timespec64_sub(*ts, pollto); -		iowq->busy_poll_to = busy_poll_to; -	} else { -		u64 to = timespec64_to_ns(ts); - -		do_div(to, 1000); -		iowq->busy_poll_to = to; -		ts->tv_sec = 0; -		ts->tv_nsec = 0; -	} -} - -static inline bool io_busy_loop_timeout(unsigned long start_time, -					unsigned long bp_usec) -{ -	if (bp_usec) { -		unsigned long end_time = start_time + bp_usec; -		unsigned long now = busy_loop_current_time(); - -		return time_after(now, end_time); -	} -	return true; -} - -static bool io_busy_loop_end(void *p, unsigned long start_time) -{ -	struct io_wait_queue *iowq = p; - -	return signal_pending(current) || -	       io_should_wake(iowq) || -	       io_busy_loop_timeout(start_time, iowq->busy_poll_to); -} - -static void io_blocking_napi_busy_loop(struct list_head *napi_list, -				       struct io_wait_queue *iowq) -{ -	unsigned long start_time = -		list_is_singular(napi_list) ? 0 : -		busy_loop_current_time(); - -	do { -		if (list_is_singular(napi_list)) { -			struct napi_entry *ne = -				list_first_entry(napi_list, -						 struct napi_entry, list); - -			napi_busy_loop(ne->napi_id, io_busy_loop_end, iowq, -				       true, BUSY_POLL_BUDGET); -			io_check_napi_entry_timeout(ne); -			break; -		} -	} while (io_napi_busy_loop(napi_list) && -		 !io_busy_loop_end(iowq, start_time)); -} - -static void io_putback_napi_list(struct io_ring_ctx *ctx, -				 struct list_head *napi_list) -{ -	struct napi_entry *cne, *lne; - -	spin_lock(&ctx->napi_lock); -	list_for_each_entry(cne, &ctx->napi_list, list) -		list_for_each_entry(lne, napi_list, list) -			if (cne->napi_id == lne->napi_id) { -				list_del(&lne->list); -				kfree(lne); -				break; -			} -	list_splice(napi_list, &ctx->napi_list); -	spin_unlock(&ctx->napi_lock); -} -#endif /* CONFIG_NET_RX_BUSY_POLL */ -  /*   * Wait until events become available, if we don't already have some. The   * application must reap them itself, as they reside on the shared cq ring. @@ -8333,9 +9346,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,  	struct io_rings *rings = ctx->rings;  	ktime_t timeout = KTIME_MAX;  	int ret; -#ifdef CONFIG_NET_RX_BUSY_POLL -	LIST_HEAD(local_napi_list); -#endif  	do {  		io_cqring_overflow_flush(ctx); @@ -8358,29 +9368,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,  			return ret;  	} -#ifdef CONFIG_NET_RX_BUSY_POLL -	iowq.busy_poll_to = 0; -	if (!(ctx->flags & IORING_SETUP_SQPOLL)) { -		spin_lock(&ctx->napi_lock); -		list_splice_init(&ctx->napi_list, &local_napi_list); -		spin_unlock(&ctx->napi_lock); -	} -#endif  	if (uts) {  		struct timespec64 ts;  		if (get_timespec64(&ts, uts))  			return -EFAULT; -#ifdef CONFIG_NET_RX_BUSY_POLL -		if (!list_empty(&local_napi_list)) -			io_adjust_busy_loop_timeout(&ts, &iowq); -#endif  		timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());  	} -#ifdef CONFIG_NET_RX_BUSY_POLL -	else if (!list_empty(&local_napi_list)) -		iowq.busy_poll_to = READ_ONCE(sysctl_net_busy_poll); -#endif  	init_waitqueue_func_entry(&iowq.wq, io_wake_function);  	iowq.wq.private = current; @@ -8390,12 +9384,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,  	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;  	trace_io_uring_cqring_wait(ctx, min_events); -#ifdef CONFIG_NET_RX_BUSY_POLL -	if (iowq.busy_poll_to) -		io_blocking_napi_busy_loop(&local_napi_list, &iowq); -	if (!list_empty(&local_napi_list)) -		io_putback_napi_list(ctx, &local_napi_list); -#endif  	do {  		/* if we can't even flush overflow, don't wait for more */  		if (!io_cqring_overflow_flush(ctx)) { @@ -8405,10 +9393,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,  		prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,  						TASK_INTERRUPTIBLE);  		ret = io_cqring_wait_schedule(ctx, &iowq, timeout); -		finish_wait(&ctx->cq_wait, &iowq.wq);  		cond_resched();  	} while (ret > 0); +	finish_wait(&ctx->cq_wait, &iowq.wq);  	restore_saved_sigmask_unless(ret == -EINTR);  	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; @@ -8646,17 +9634,57 @@ static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)  {  	table->files = kvcalloc(nr_files, sizeof(table->files[0]),  				GFP_KERNEL_ACCOUNT); -	return !!table->files; +	if (unlikely(!table->files)) +		return false; + +	table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT); +	if (unlikely(!table->bitmap)) { +		kvfree(table->files); +		return false; +	} + +	return true;  }  static void io_free_file_tables(struct io_file_table *table)  {  	kvfree(table->files); +	bitmap_free(table->bitmap);  	table->files = NULL; +	table->bitmap = NULL; +} + +static inline void io_file_bitmap_set(struct io_file_table *table, int bit) +{ +	WARN_ON_ONCE(test_bit(bit, table->bitmap)); +	__set_bit(bit, table->bitmap); +	if (bit == table->alloc_hint) +		table->alloc_hint++; +} + +static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) +{ +	__clear_bit(bit, table->bitmap); +	table->alloc_hint = bit;  }  static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)  { +#if !defined(IO_URING_SCM_ALL) +	int i; + +	for (i = 0; i < ctx->nr_user_files; i++) { +		struct file *file = io_file_from_index(ctx, i); + +		if (!file) +			continue; +		if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) +			continue; +		io_file_bitmap_clear(&ctx->file_table, i); +		fput(file); +	} +#endif +  #if defined(CONFIG_UNIX)  	if (ctx->ring_sock) {  		struct sock *sock = ctx->ring_sock->sk; @@ -8665,16 +9693,6 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)  		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)  			kfree_skb(skb);  	} -#else -	int i; - -	for (i = 0; i < ctx->nr_user_files; i++) { -		struct file *file; - -		file = io_file_from_index(ctx, i); -		if (file) -			fput(file); -	}  #endif  	io_free_file_tables(&ctx->file_table);  	io_rsrc_data_free(ctx->file_data); @@ -8819,103 +9837,66 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,  	return sqd;  } -#if defined(CONFIG_UNIX)  /*   * Ensure the UNIX gc is aware of our file set, so we are certain that   * the io_uring can be safely unregistered on process exit, even if we have - * loops in the file referencing. + * loops in the file referencing. We account only files that can hold other + * files because otherwise they can't form a loop and so are not interesting + * for GC.   */ -static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) +static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)  { +#if defined(CONFIG_UNIX)  	struct sock *sk = ctx->ring_sock->sk; +	struct sk_buff_head *head = &sk->sk_receive_queue;  	struct scm_fp_list *fpl;  	struct sk_buff *skb; -	int i, nr_files; - -	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); -	if (!fpl) -		return -ENOMEM; -	skb = alloc_skb(0, GFP_KERNEL); -	if (!skb) { -		kfree(fpl); -		return -ENOMEM; -	} +	if (likely(!io_file_need_scm(file))) +		return 0; -	skb->sk = sk; +	/* +	 * See if we can merge this file into an existing skb SCM_RIGHTS +	 * file set. If there's no room, fall back to allocating a new skb +	 * and filling it in. +	 */ +	spin_lock_irq(&head->lock); +	skb = skb_peek(head); +	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) +		__skb_unlink(skb, head); +	else +		skb = NULL; +	spin_unlock_irq(&head->lock); -	nr_files = 0; -	fpl->user = get_uid(current_user()); -	for (i = 0; i < nr; i++) { -		struct file *file = io_file_from_index(ctx, i + offset); +	if (!skb) { +		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); +		if (!fpl) +			return -ENOMEM; -		if (!file) -			continue; -		fpl->fp[nr_files] = get_file(file); -		unix_inflight(fpl->user, fpl->fp[nr_files]); -		nr_files++; -	} +		skb = alloc_skb(0, GFP_KERNEL); +		if (!skb) { +			kfree(fpl); +			return -ENOMEM; +		} -	if (nr_files) { +		fpl->user = get_uid(current_user());  		fpl->max = SCM_MAX_FD; -		fpl->count = nr_files; +		fpl->count = 0; +  		UNIXCB(skb).fp = fpl; +		skb->sk = sk;  		skb->destructor = unix_destruct_scm;  		refcount_add(skb->truesize, &sk->sk_wmem_alloc); -		skb_queue_head(&sk->sk_receive_queue, skb); - -		for (i = 0; i < nr_files; i++) -			fput(fpl->fp[i]); -	} else { -		kfree_skb(skb); -		free_uid(fpl->user); -		kfree(fpl); -	} - -	return 0; -} - -/* - * If UNIX sockets are enabled, fd passing can cause a reference cycle which - * causes regular reference counting to break down. We rely on the UNIX - * garbage collection to take care of this problem for us. - */ -static int io_sqe_files_scm(struct io_ring_ctx *ctx) -{ -	unsigned left, total; -	int ret = 0; - -	total = 0; -	left = ctx->nr_user_files; -	while (left) { -		unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); - -		ret = __io_sqe_files_scm(ctx, this_files, total); -		if (ret) -			break; -		left -= this_files; -		total += this_files; -	} - -	if (!ret) -		return 0; - -	while (total < ctx->nr_user_files) { -		struct file *file = io_file_from_index(ctx, total); - -		if (file) -			fput(file); -		total++;  	} -	return ret; -} -#else -static int io_sqe_files_scm(struct io_ring_ctx *ctx) -{ +	fpl = UNIXCB(skb).fp; +	fpl->fp[fpl->count++] = get_file(file); +	unix_inflight(fpl->user, file); +	skb_queue_head(head, skb); +	fput(file); +#endif  	return 0;  } -#endif  static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)  { @@ -8926,6 +9907,11 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)  	struct sk_buff *skb;  	int i; +	if (!io_file_need_scm(file)) { +		fput(file); +		return; +	} +  	__skb_queue_head_init(&list);  	/* @@ -8990,15 +9976,17 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)  		list_del(&prsrc->list);  		if (prsrc->tag) { -			bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; +			if (ctx->flags & IORING_SETUP_IOPOLL) +				mutex_lock(&ctx->uring_lock); -			io_ring_submit_lock(ctx, lock_ring);  			spin_lock(&ctx->completion_lock);  			io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);  			io_commit_cqring(ctx);  			spin_unlock(&ctx->completion_lock);  			io_cqring_ev_posted(ctx); -			io_ring_submit_unlock(ctx, lock_ring); + +			if (ctx->flags & IORING_SETUP_IOPOLL) +				mutex_unlock(&ctx->uring_lock);  		}  		rsrc_data->do_put(ctx, prsrc); @@ -9052,27 +10040,31 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,  	if (ret)  		return ret; -	ret = -ENOMEM; -	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) -		goto out_free; +	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { +		io_rsrc_data_free(ctx->file_data); +		ctx->file_data = NULL; +		return -ENOMEM; +	}  	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { -		if (copy_from_user(&fd, &fds[i], sizeof(fd))) { +		struct io_fixed_file *file_slot; + +		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {  			ret = -EFAULT; -			goto out_fput; +			goto fail;  		}  		/* allow sparse sets */ -		if (fd == -1) { +		if (!fds || fd == -1) {  			ret = -EINVAL;  			if (unlikely(*io_get_tag_slot(ctx->file_data, i))) -				goto out_fput; +				goto fail;  			continue;  		}  		file = fget(fd);  		ret = -EBADF;  		if (unlikely(!file)) -			goto out_fput; +			goto fail;  		/*  		 * Don't allow io_uring instances to be registered. If UNIX @@ -9083,86 +10075,37 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,  		 */  		if (file->f_op == &io_uring_fops) {  			fput(file); -			goto out_fput; +			goto fail;  		} -		io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); -	} - -	ret = io_sqe_files_scm(ctx); -	if (ret) { -		__io_sqe_files_unregister(ctx); -		return ret; -	} - -	io_rsrc_node_switch(ctx, NULL); -	return ret; -out_fput: -	for (i = 0; i < ctx->nr_user_files; i++) { -		file = io_file_from_index(ctx, i); -		if (file) +		ret = io_scm_file_account(ctx, file); +		if (ret) {  			fput(file); -	} -	io_free_file_tables(&ctx->file_table); -	ctx->nr_user_files = 0; -out_free: -	io_rsrc_data_free(ctx->file_data); -	ctx->file_data = NULL; -	return ret; -} - -static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, -				int index) -{ -#if defined(CONFIG_UNIX) -	struct sock *sock = ctx->ring_sock->sk; -	struct sk_buff_head *head = &sock->sk_receive_queue; -	struct sk_buff *skb; - -	/* -	 * See if we can merge this file into an existing skb SCM_RIGHTS -	 * file set. If there's no room, fall back to allocating a new skb -	 * and filling it in. -	 */ -	spin_lock_irq(&head->lock); -	skb = skb_peek(head); -	if (skb) { -		struct scm_fp_list *fpl = UNIXCB(skb).fp; - -		if (fpl->count < SCM_MAX_FD) { -			__skb_unlink(skb, head); -			spin_unlock_irq(&head->lock); -			fpl->fp[fpl->count] = get_file(file); -			unix_inflight(fpl->user, fpl->fp[fpl->count]); -			fpl->count++; -			spin_lock_irq(&head->lock); -			__skb_queue_head(head, skb); -		} else { -			skb = NULL; +			goto fail;  		} -	} -	spin_unlock_irq(&head->lock); - -	if (skb) { -		fput(file); -		return 0; +		file_slot = io_fixed_file_slot(&ctx->file_table, i); +		io_fixed_file_set(file_slot, file); +		io_file_bitmap_set(&ctx->file_table, i);  	} -	return __io_sqe_files_scm(ctx, 1, index); -#else +	io_rsrc_node_switch(ctx, NULL);  	return 0; -#endif +fail: +	__io_sqe_files_unregister(ctx); +	return ret;  }  static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,  				 struct io_rsrc_node *node, void *rsrc)  { +	u64 *tag_slot = io_get_tag_slot(data, idx);  	struct io_rsrc_put *prsrc;  	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);  	if (!prsrc)  		return -ENOMEM; -	prsrc->tag = *io_get_tag_slot(data, idx); +	prsrc->tag = *tag_slot; +	*tag_slot = 0;  	prsrc->rsrc = rsrc;  	list_add(&prsrc->list, &node->rsrc_list);  	return 0; @@ -9172,12 +10115,11 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,  				 unsigned int issue_flags, u32 slot_index)  {  	struct io_ring_ctx *ctx = req->ctx; -	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;  	bool needs_switch = false;  	struct io_fixed_file *file_slot;  	int ret = -EBADF; -	io_ring_submit_lock(ctx, needs_lock); +	io_ring_submit_lock(ctx, issue_flags);  	if (file->f_op == &io_uring_fops)  		goto err;  	ret = -ENXIO; @@ -9203,22 +10145,20 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,  		if (ret)  			goto err;  		file_slot->file_ptr = 0; +		io_file_bitmap_clear(&ctx->file_table, slot_index);  		needs_switch = true;  	} -	*io_get_tag_slot(ctx->file_data, slot_index) = 0; -	io_fixed_file_set(file_slot, file); -	ret = io_sqe_file_register(ctx, file, slot_index); -	if (ret) { -		file_slot->file_ptr = 0; -		goto err; +	ret = io_scm_file_account(ctx, file); +	if (!ret) { +		*io_get_tag_slot(ctx->file_data, slot_index) = 0; +		io_fixed_file_set(file_slot, file); +		io_file_bitmap_set(&ctx->file_table, slot_index);  	} - -	ret = 0;  err:  	if (needs_switch)  		io_rsrc_node_switch(ctx, ctx->file_data); -	io_ring_submit_unlock(ctx, needs_lock); +	io_ring_submit_unlock(ctx, issue_flags);  	if (ret)  		fput(file);  	return ret; @@ -9228,12 +10168,11 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)  {  	unsigned int offset = req->close.file_slot - 1;  	struct io_ring_ctx *ctx = req->ctx; -	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;  	struct io_fixed_file *file_slot;  	struct file *file; -	int ret, i; +	int ret; -	io_ring_submit_lock(ctx, needs_lock); +	io_ring_submit_lock(ctx, issue_flags);  	ret = -ENXIO;  	if (unlikely(!ctx->file_data))  		goto out; @@ -9244,8 +10183,8 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)  	if (ret)  		goto out; -	i = array_index_nospec(offset, ctx->nr_user_files); -	file_slot = io_fixed_file_slot(&ctx->file_table, i); +	offset = array_index_nospec(offset, ctx->nr_user_files); +	file_slot = io_fixed_file_slot(&ctx->file_table, offset);  	ret = -EBADF;  	if (!file_slot->file_ptr)  		goto out; @@ -9256,10 +10195,11 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)  		goto out;  	file_slot->file_ptr = 0; +	io_file_bitmap_clear(&ctx->file_table, offset);  	io_rsrc_node_switch(ctx, ctx->file_data);  	ret = 0;  out: -	io_ring_submit_unlock(ctx, needs_lock); +	io_ring_submit_unlock(ctx, issue_flags);  	return ret;  } @@ -9301,11 +10241,11 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,  		if (file_slot->file_ptr) {  			file = (struct file *)(file_slot->file_ptr & FFS_MASK); -			err = io_queue_rsrc_removal(data, up->offset + done, -						    ctx->rsrc_node, file); +			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);  			if (err)  				break;  			file_slot->file_ptr = 0; +			io_file_bitmap_clear(&ctx->file_table, i);  			needs_switch = true;  		}  		if (fd != -1) { @@ -9327,14 +10267,14 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,  				err = -EBADF;  				break;  			} -			*io_get_tag_slot(data, up->offset + done) = tag; -			io_fixed_file_set(file_slot, file); -			err = io_sqe_file_register(ctx, file, i); +			err = io_scm_file_account(ctx, file);  			if (err) { -				file_slot->file_ptr = 0;  				fput(file);  				break;  			} +			*io_get_tag_slot(data, i) = tag; +			io_fixed_file_set(file_slot, file); +			io_file_bitmap_set(&ctx->file_table, i);  		}  	} @@ -9411,11 +10351,10 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,  	xa_init(&tctx->xa);  	init_waitqueue_head(&tctx->wait);  	atomic_set(&tctx->in_idle, 0); -	atomic_set(&tctx->inflight_tracked, 0);  	task->io_uring = tctx;  	spin_lock_init(&tctx->task_lock);  	INIT_WQ_LIST(&tctx->task_list); -	INIT_WQ_LIST(&tctx->prior_task_list); +	INIT_WQ_LIST(&tctx->prio_task_list);  	init_task_work(&tctx->task_work, tctx_task_work);  	return 0;  } @@ -9593,8 +10532,8 @@ static void *io_mem_alloc(size_t size)  	return (void *) __get_free_pages(gfp, get_order(size));  } -static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, -				size_t *sq_offset) +static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, +				unsigned int cq_entries, size_t *sq_offset)  {  	struct io_rings *rings;  	size_t off, sq_array_size; @@ -9602,6 +10541,10 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,  	off = struct_size(rings, cqes, cq_entries);  	if (off == SIZE_MAX)  		return SIZE_MAX; +	if (ctx->flags & IORING_SETUP_CQE32) { +		if (check_shl_overflow(off, 1, &off)) +			return SIZE_MAX; +	}  #ifdef CONFIG_SMP  	off = ALIGN(off, SMP_CACHE_BYTES); @@ -9763,30 +10706,18 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,  	return ret;  } -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, -				  struct io_mapped_ubuf **pimu, -				  struct page **last_hpage) +static struct page **io_pin_pages(unsigned long ubuf, unsigned long len, +				  int *npages)  { -	struct io_mapped_ubuf *imu = NULL; +	unsigned long start, end, nr_pages;  	struct vm_area_struct **vmas = NULL;  	struct page **pages = NULL; -	unsigned long off, start, end, ubuf; -	size_t size; -	int ret, pret, nr_pages, i; +	int i, pret, ret = -ENOMEM; -	if (!iov->iov_base) { -		*pimu = ctx->dummy_ubuf; -		return 0; -	} - -	ubuf = (unsigned long) iov->iov_base; -	end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; +	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;  	start = ubuf >> PAGE_SHIFT;  	nr_pages = end - start; -	*pimu = NULL; -	ret = -ENOMEM; -  	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);  	if (!pages)  		goto done; @@ -9796,10 +10727,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,  	if (!vmas)  		goto done; -	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); -	if (!imu) -		goto done; -  	ret = 0;  	mmap_read_lock(current->mm);  	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, @@ -9817,6 +10744,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,  				break;  			}  		} +		*npages = nr_pages;  	} else {  		ret = pret < 0 ? pret : -EFAULT;  	} @@ -9830,14 +10758,53 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,  			unpin_user_pages(pages, pret);  		goto done;  	} +	ret = 0; +done: +	kvfree(vmas); +	if (ret < 0) { +		kvfree(pages); +		pages = ERR_PTR(ret); +	} +	return pages; +} + +static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, +				  struct io_mapped_ubuf **pimu, +				  struct page **last_hpage) +{ +	struct io_mapped_ubuf *imu = NULL; +	struct page **pages = NULL; +	unsigned long off; +	size_t size; +	int ret, nr_pages, i; + +	if (!iov->iov_base) { +		*pimu = ctx->dummy_ubuf; +		return 0; +	} + +	*pimu = NULL; +	ret = -ENOMEM; + +	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, +				&nr_pages); +	if (IS_ERR(pages)) { +		ret = PTR_ERR(pages); +		pages = NULL; +		goto done; +	} + +	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); +	if (!imu) +		goto done; -	ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage); +	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);  	if (ret) { -		unpin_user_pages(pages, pret); +		unpin_user_pages(pages, nr_pages);  		goto done;  	} -	off = ubuf & ~PAGE_MASK; +	off = (unsigned long) iov->iov_base & ~PAGE_MASK;  	size = iov->iov_len;  	for (i = 0; i < nr_pages; i++) {  		size_t vec_len; @@ -9850,8 +10817,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,  		size -= vec_len;  	}  	/* store original address for later verification */ -	imu->ubuf = ubuf; -	imu->ubuf_end = ubuf + iov->iov_len; +	imu->ubuf = (unsigned long) iov->iov_base; +	imu->ubuf_end = imu->ubuf + iov->iov_len;  	imu->nr_bvecs = nr_pages;  	*pimu = imu;  	ret = 0; @@ -9859,7 +10826,6 @@ done:  	if (ret)  		kvfree(imu);  	kvfree(pages); -	kvfree(vmas);  	return ret;  } @@ -9918,12 +10884,17 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,  	}  	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { -		ret = io_copy_iov(ctx, &iov, arg, i); -		if (ret) -			break; -		ret = io_buffer_validate(&iov); -		if (ret) -			break; +		if (arg) { +			ret = io_copy_iov(ctx, &iov, arg, i); +			if (ret) +				break; +			ret = io_buffer_validate(&iov); +			if (ret) +				break; +		} else { +			memset(&iov, 0, sizeof(iov)); +		} +  		if (!iov.iov_base && *io_get_tag_slot(data, i)) {  			ret = -EINVAL;  			break; @@ -9986,7 +10957,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,  		i = array_index_nospec(offset, ctx->nr_user_bufs);  		if (ctx->user_bufs[i] != ctx->dummy_ubuf) { -			err = io_queue_rsrc_removal(ctx->buf_data, offset, +			err = io_queue_rsrc_removal(ctx->buf_data, i,  						    ctx->rsrc_node, ctx->user_bufs[i]);  			if (unlikely(err)) {  				io_buffer_unmap(ctx, &imu); @@ -10062,19 +11033,19 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)  static void io_destroy_buffers(struct io_ring_ctx *ctx)  { +	struct io_buffer_list *bl; +	unsigned long index;  	int i; -	for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) { -		struct list_head *list = &ctx->io_buffers[i]; - -		while (!list_empty(list)) { -			struct io_buffer_list *bl; +	for (i = 0; i < BGID_ARRAY; i++) { +		if (!ctx->io_bl) +			break; +		__io_remove_buffers(ctx, &ctx->io_bl[i], -1U); +	} -			bl = list_first_entry(list, struct io_buffer_list, list); -			__io_remove_buffers(ctx, bl, -1U); -			list_del(&bl->list); -			kfree(bl); -		} +	xa_for_each(&ctx->io_bl_xa, index, bl) { +		xa_erase(&ctx->io_bl_xa, bl->bgid); +		__io_remove_buffers(ctx, bl, -1U);  	}  	while (!list_empty(&ctx->io_buffers_pages)) { @@ -10094,7 +11065,7 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)  	mutex_lock(&ctx->uring_lock);  	io_flush_cached_locked_reqs(ctx, state); -	while (state->free_list.next) { +	while (!io_req_cache_empty(ctx)) {  		struct io_wq_work_node *node;  		struct io_kiocb *req; @@ -10181,10 +11152,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)  	io_req_caches_free(ctx);  	if (ctx->hash_map)  		io_wq_put_hash(ctx->hash_map); -	io_free_napi_list(ctx);  	kfree(ctx->cancel_hash);  	kfree(ctx->dummy_ubuf); -	kfree(ctx->io_buffers); +	kfree(ctx->io_bl); +	xa_destroy(&ctx->io_bl_xa);  	kfree(ctx);  } @@ -10215,7 +11186,8 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)  	 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this  	 * pushs them to do the flush.  	 */ -	if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow)) +	if (io_cqring_events(ctx) || +	    test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))  		mask |= EPOLLIN | EPOLLRDNORM;  	return mask; @@ -10347,8 +11319,7 @@ static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,  		}  	}  	spin_unlock_irq(&ctx->timeout_lock); -	if (canceled != 0) -		io_commit_cqring(ctx); +	io_commit_cqring(ctx);  	spin_unlock(&ctx->completion_lock);  	if (canceled != 0)  		io_cqring_ev_posted(ctx); @@ -10368,11 +11339,13 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)  		io_unregister_personality(ctx, index);  	mutex_unlock(&ctx->uring_lock); -	io_kill_timeouts(ctx, NULL, true); -	io_poll_remove_all(ctx, NULL, true); - -	/* if we failed setting up the ctx, we might not have any rings */ -	io_iopoll_try_reap_events(ctx); +	/* failed during ring init, it couldn't have issued any requests */ +	if (ctx->rings) { +		io_kill_timeouts(ctx, NULL, true); +		io_poll_remove_all(ctx, NULL, true); +		/* if we failed setting up the ctx, we might not have any rings */ +		io_iopoll_try_reap_events(ctx); +	}  	INIT_WORK(&ctx->exit_work, io_ring_exit_work);  	/* @@ -10464,6 +11437,10 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,  	struct io_task_cancel cancel = { .task = task, .all = cancel_all, };  	struct io_uring_task *tctx = task ? task->io_uring : NULL; +	/* failed during ring init, it couldn't have issued any requests */ +	if (!ctx->rings) +		return; +  	while (1) {  		enum io_wq_cancel cret;  		bool ret = false; @@ -10604,7 +11581,7 @@ static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)  static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)  {  	if (tracked) -		return atomic_read(&tctx->inflight_tracked); +		return 0;  	return percpu_counter_sum(&tctx->inflight);  } @@ -10755,6 +11732,11 @@ static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,  			break;  		} +		if (reg.resv) { +			ret = -EINVAL; +			break; +		} +  		if (reg.offset == -1U) {  			start = 0;  			end = IO_RINGFD_REG_MAX; @@ -10801,7 +11783,7 @@ static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,  			ret = -EFAULT;  			break;  		} -		if (reg.offset >= IO_RINGFD_REG_MAX) { +		if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {  			ret = -EINVAL;  			break;  		} @@ -10904,6 +11886,19 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)  	return 0;  } +static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) +{ +	if (flags & IORING_ENTER_EXT_ARG) { +		struct io_uring_getevents_arg arg; + +		if (argsz != sizeof(arg)) +			return -EINVAL; +		if (copy_from_user(&arg, argp, sizeof(arg))) +			return -EFAULT; +	} +	return 0; +} +  static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,  			  struct __kernel_timespec __user **ts,  			  const sigset_t __user **sig) @@ -10928,6 +11923,8 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz  		return -EINVAL;  	if (copy_from_user(&arg, argp, sizeof(arg)))  		return -EFAULT; +	if (arg.pad) +		return -EINVAL;  	*sig = u64_to_user_ptr(arg.sigmask);  	*argsz = arg.sigmask_sz;  	*ts = u64_to_user_ptr(arg.ts); @@ -10939,7 +11936,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,  		size_t, argsz)  {  	struct io_ring_ctx *ctx; -	int submitted = 0;  	struct fd f;  	long ret; @@ -11002,39 +11998,64 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,  			if (ret)  				goto out;  		} -		submitted = to_submit; +		ret = to_submit;  	} else if (to_submit) {  		ret = io_uring_add_tctx_node(ctx);  		if (unlikely(ret))  			goto out; -		mutex_lock(&ctx->uring_lock); -		submitted = io_submit_sqes(ctx, to_submit); -		mutex_unlock(&ctx->uring_lock); -		if (submitted != to_submit) +		mutex_lock(&ctx->uring_lock); +		ret = io_submit_sqes(ctx, to_submit); +		if (ret != to_submit) { +			mutex_unlock(&ctx->uring_lock);  			goto out; +		} +		if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll) +			goto iopoll_locked; +		mutex_unlock(&ctx->uring_lock);  	}  	if (flags & IORING_ENTER_GETEVENTS) { -		const sigset_t __user *sig; -		struct __kernel_timespec __user *ts; +		int ret2; +		if (ctx->syscall_iopoll) { +			/* +			 * We disallow the app entering submit/complete with +			 * polling, but we still need to lock the ring to +			 * prevent racing with polled issue that got punted to +			 * a workqueue. +			 */ +			mutex_lock(&ctx->uring_lock); +iopoll_locked: +			ret2 = io_validate_ext_arg(flags, argp, argsz); +			if (likely(!ret2)) { +				min_complete = min(min_complete, +						   ctx->cq_entries); +				ret2 = io_iopoll_check(ctx, min_complete); +			} +			mutex_unlock(&ctx->uring_lock); +		} else { +			const sigset_t __user *sig; +			struct __kernel_timespec __user *ts; -		ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); -		if (unlikely(ret)) -			goto out; +			ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); +			if (likely(!ret2)) { +				min_complete = min(min_complete, +						   ctx->cq_entries); +				ret2 = io_cqring_wait(ctx, min_complete, sig, +						      argsz, ts); +			} +		} -		min_complete = min(min_complete, ctx->cq_entries); +		if (!ret) { +			ret = ret2; -		/* -		 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user -		 * space applications don't need to do io completion events -		 * polling again, they can rely on io_sq_thread to do polling -		 * work, which can reduce cpu usage and uring_lock contention. -		 */ -		if (ctx->flags & IORING_SETUP_IOPOLL && -		    !(ctx->flags & IORING_SETUP_SQPOLL)) { -			ret = io_iopoll_check(ctx, min_complete); -		} else { -			ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); +			/* +			 * EBADR indicates that one or more CQE were dropped. +			 * Once the user has been informed we can clear the bit +			 * as they are obviously ok with those drops. +			 */ +			if (unlikely(ret2 == -EBADR)) +				clear_bit(IO_CHECK_CQ_DROPPED_BIT, +					  &ctx->check_cq);  		}  	} @@ -11043,7 +12064,7 @@ out:  out_fput:  	if (!(flags & IORING_ENTER_REGISTERED_RING))  		fdput(f); -	return submitted ? submitted : ret; +	return ret;  }  #ifdef CONFIG_PROC_FS @@ -11090,10 +12111,15 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,  	unsigned int sq_tail = READ_ONCE(r->sq.tail);  	unsigned int cq_head = READ_ONCE(r->cq.head);  	unsigned int cq_tail = READ_ONCE(r->cq.tail); +	unsigned int cq_shift = 0;  	unsigned int sq_entries, cq_entries;  	bool has_lock; +	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);  	unsigned int i; +	if (is_cqe32) +		cq_shift = 1; +  	/*  	 * we may get imprecise sqe and cqe info if uring is actively running  	 * since we get cached_sq_head and cached_cq_tail without uring_lock @@ -11126,11 +12152,18 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,  	cq_entries = min(cq_tail - cq_head, ctx->cq_entries);  	for (i = 0; i < cq_entries; i++) {  		unsigned int entry = i + cq_head; -		struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask]; +		struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; -		seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", +		if (!is_cqe32) { +			seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",  			   entry & cq_mask, cqe->user_data, cqe->res,  			   cqe->flags); +		} else { +			seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, " +				"extra1:%llu, extra2:%llu\n", +				entry & cq_mask, cqe->user_data, cqe->res, +				cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]); +		}  	}  	/* @@ -11233,7 +12266,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,  	ctx->sq_entries = p->sq_entries;  	ctx->cq_entries = p->cq_entries; -	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); +	size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);  	if (size == SIZE_MAX)  		return -EOVERFLOW; @@ -11248,7 +12281,10 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,  	rings->sq_ring_entries = p->sq_entries;  	rings->cq_ring_entries = p->cq_entries; -	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); +	if (p->flags & IORING_SETUP_SQE128) +		size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); +	else +		size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);  	if (size == SIZE_MAX) {  		io_mem_free(ctx->rings);  		ctx->rings = NULL; @@ -11360,11 +12396,41 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,  	ctx = io_ring_ctx_alloc(p);  	if (!ctx)  		return -ENOMEM; + +	/* +	 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user +	 * space applications don't need to do io completion events +	 * polling again, they can rely on io_sq_thread to do polling +	 * work, which can reduce cpu usage and uring_lock contention. +	 */ +	if (ctx->flags & IORING_SETUP_IOPOLL && +	    !(ctx->flags & IORING_SETUP_SQPOLL)) +		ctx->syscall_iopoll = 1; +  	ctx->compat = in_compat_syscall();  	if (!capable(CAP_IPC_LOCK))  		ctx->user = get_uid(current_user());  	/* +	 * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if +	 * COOP_TASKRUN is set, then IPIs are never needed by the app. +	 */ +	ret = -EINVAL; +	if (ctx->flags & IORING_SETUP_SQPOLL) { +		/* IPI related flags don't make sense with SQPOLL */ +		if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | +				  IORING_SETUP_TASKRUN_FLAG)) +			goto err; +		ctx->notify_method = TWA_SIGNAL_NO_IPI; +	} else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { +		ctx->notify_method = TWA_SIGNAL_NO_IPI; +	} else { +		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) +			goto err; +		ctx->notify_method = TWA_SIGNAL; +	} + +	/*  	 * This is just grabbed for accounting purposes. When a process exits,  	 * the mm is exited and dropped before the files, hence we need to hang  	 * on to this mm purely for the purposes of being able to unaccount @@ -11409,7 +12475,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,  			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |  			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |  			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | -			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP; +			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | +			IORING_FEAT_LINKED_FILE;  	if (copy_to_user(params, p, sizeof(*p))) {  		ret = -EFAULT; @@ -11460,10 +12527,12 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)  	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |  			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |  			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | -			IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL)) +			IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | +			IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | +			IORING_SETUP_SQE128 | IORING_SETUP_CQE32))  		return -EINVAL; -	return  io_uring_create(entries, &p, params); +	return io_uring_create(entries, &p, params);  }  SYSCALL_DEFINE2(io_uring_setup, u32, entries, @@ -11620,8 +12689,6 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,  	__u32 tmp;  	int err; -	if (up->resv) -		return -EINVAL;  	if (check_add_overflow(up->offset, nr_args, &tmp))  		return -EOVERFLOW;  	err = io_rsrc_node_switch_start(ctx); @@ -11647,6 +12714,8 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,  	memset(&up, 0, sizeof(up));  	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))  		return -EFAULT; +	if (up.resv || up.resv2) +		return -EINVAL;  	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);  } @@ -11659,7 +12728,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,  		return -EINVAL;  	if (copy_from_user(&up, arg, sizeof(up)))  		return -EFAULT; -	if (!up.nr || up.resv) +	if (!up.nr || up.resv || up.resv2)  		return -EINVAL;  	return __io_register_rsrc_update(ctx, type, &up, up.nr);  } @@ -11676,14 +12745,20 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,  	memset(&rr, 0, sizeof(rr));  	if (copy_from_user(&rr, arg, size))  		return -EFAULT; -	if (!rr.nr || rr.resv || rr.resv2) +	if (!rr.nr || rr.resv2) +		return -EINVAL; +	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)  		return -EINVAL;  	switch (type) {  	case IORING_RSRC_FILE: +		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) +			break;  		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),  					     rr.nr, u64_to_user_ptr(rr.tags));  	case IORING_RSRC_BUFFER: +		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) +			break;  		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),  					       rr.nr, u64_to_user_ptr(rr.tags));  	} @@ -11707,7 +12782,15 @@ static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,  	if (len > cpumask_size())  		len = cpumask_size(); -	if (copy_from_user(new_mask, arg, len)) { +	if (in_compat_syscall()) { +		ret = compat_get_bitmap(cpumask_bits(new_mask), +					(const compat_ulong_t __user *)arg, +					len * 8 /* CHAR_BIT */); +	} else { +		ret = copy_from_user(new_mask, arg, len); +	} + +	if (ret) {  		free_cpumask_var(new_mask);  		return -EFAULT;  	} @@ -11810,6 +12893,85 @@ err:  	return ret;  } +static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ +	struct io_uring_buf_ring *br; +	struct io_uring_buf_reg reg; +	struct io_buffer_list *bl; +	struct page **pages; +	int nr_pages; + +	if (copy_from_user(®, arg, sizeof(reg))) +		return -EFAULT; + +	if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) +		return -EINVAL; +	if (!reg.ring_addr) +		return -EFAULT; +	if (reg.ring_addr & ~PAGE_MASK) +		return -EINVAL; +	if (!is_power_of_2(reg.ring_entries)) +		return -EINVAL; + +	if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { +		int ret = io_init_bl_list(ctx); +		if (ret) +			return ret; +	} + +	bl = io_buffer_get_list(ctx, reg.bgid); +	if (bl) { +		/* if mapped buffer ring OR classic exists, don't allow */ +		if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) +			return -EEXIST; +	} else { +		bl = kzalloc(sizeof(*bl), GFP_KERNEL); +		if (!bl) +			return -ENOMEM; +	} + +	pages = io_pin_pages(reg.ring_addr, +			     struct_size(br, bufs, reg.ring_entries), +			     &nr_pages); +	if (IS_ERR(pages)) { +		kfree(bl); +		return PTR_ERR(pages); +	} + +	br = page_address(pages[0]); +	bl->buf_pages = pages; +	bl->buf_nr_pages = nr_pages; +	bl->nr_entries = reg.ring_entries; +	bl->buf_ring = br; +	bl->mask = reg.ring_entries - 1; +	io_buffer_add_list(ctx, bl, reg.bgid); +	return 0; +} + +static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ +	struct io_uring_buf_reg reg; +	struct io_buffer_list *bl; + +	if (copy_from_user(®, arg, sizeof(reg))) +		return -EFAULT; +	if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) +		return -EINVAL; + +	bl = io_buffer_get_list(ctx, reg.bgid); +	if (!bl) +		return -ENOENT; +	if (!bl->buf_nr_pages) +		return -EINVAL; + +	__io_remove_buffers(ctx, bl, -1U); +	if (bl->bgid >= BGID_ARRAY) { +		xa_erase(&ctx->io_bl_xa, bl->bgid); +		kfree(bl); +	} +	return 0; +} +  static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,  			       void __user *arg, unsigned nr_args)  	__releases(ctx->uring_lock) @@ -11835,6 +12997,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,  	switch (opcode) {  	case IORING_REGISTER_BUFFERS: +		ret = -EFAULT; +		if (!arg) +			break;  		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);  		break;  	case IORING_UNREGISTER_BUFFERS: @@ -11844,6 +13009,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,  		ret = io_sqe_buffers_unregister(ctx);  		break;  	case IORING_REGISTER_FILES: +		ret = -EFAULT; +		if (!arg) +			break;  		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);  		break;  	case IORING_UNREGISTER_FILES: @@ -11938,6 +13106,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,  	case IORING_UNREGISTER_RING_FDS:  		ret = io_ringfd_unregister(ctx, arg, nr_args);  		break; +	case IORING_REGISTER_PBUF_RING: +		ret = -EINVAL; +		if (!arg || nr_args != 1) +			break; +		ret = io_register_pbuf_ring(ctx, arg); +		break; +	case IORING_UNREGISTER_PBUF_RING: +		ret = -EINVAL; +		if (!arg || nr_args != 1) +			break; +		ret = io_unregister_pbuf_ring(ctx, arg); +		break;  	default:  		ret = -EINVAL;  		break; @@ -12014,6 +13194,7 @@ static int __init io_uring_init(void)  	BUILD_BUG_SQE_ELEM(42, __u16,  personality);  	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);  	BUILD_BUG_SQE_ELEM(44, __u32,  file_index); +	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);  	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=  		     sizeof(struct io_uring_rsrc_update)); @@ -12022,6 +13203,10 @@ static int __init io_uring_init(void)  	/* ->buf_index is u16 */  	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); +	BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE); +	BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); +	BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != +		     offsetof(struct io_uring_buf_ring, tail));  	/* should fit into one byte */  	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); @@ -12031,6 +13216,10 @@ static int __init io_uring_init(void)  	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);  	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); +	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); + +	BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64); +  	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |  				SLAB_ACCOUNT);  	return 0; | 
