From 6e6b8c62120a22acd8cb759304e4cd2e3215d488 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Mar 2024 22:00:28 +0000 Subject: io_uring/rw: avoid punting to io-wq directly kiocb_done() should care to specifically redirecting requests to io-wq. Remove the hopping to tw to then queue an io-wq, return -EAGAIN and let the core code io_uring handle offloading. Signed-off-by: Pavel Begunkov Tested-by: Ming Lei Link: https://lore.kernel.org/r/413564e550fe23744a970e1783dfa566291b0e6f.1710799188.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'io_uring/rw.c') diff --git a/io_uring/rw.c b/io_uring/rw.c index c8d48287439e..0afe4f9e0e3f 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -187,12 +187,6 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) return NULL; } -static void io_req_task_queue_reissue(struct io_kiocb *req) -{ - req->io_task_work.func = io_queue_iowq; - io_req_task_work_add(req); -} - #ifdef CONFIG_BLOCK static bool io_resubmit_prep(struct io_kiocb *req) { @@ -405,7 +399,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; if (io_resubmit_prep(req)) - io_req_task_queue_reissue(req); + return -EAGAIN; else io_req_task_queue_fail(req, final_ret); } -- cgit v1.2.3-70-g09d2 From 8e5b3b89ecaf6d9295e561c225b35c574a5e0fe7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Mar 2024 22:00:30 +0000 Subject: io_uring: remove struct io_tw_state::locked ctx is always locked for task_work now, so get rid of struct io_tw_state::locked. Note I'm stopping one step before removing io_tw_state altogether, which is not empty, because it still serves the purpose of indicating which function is a tw callback and forcing users not to invoke them carelessly out of a wrong context. The removal can always be done later. 
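The invariant this change leans on can be shown with a minimal standalone model (illustrative names only, not kernel code, and it assumes GNU C so the empty struct is accepted): task_work callbacks only ever run from a dispatcher that already holds the ring lock, so an empty token type is enough to mark tw-only context, and the callee can simply assert the lock instead of checking a ->locked flag.

#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct tw_state { };			/* empty marker, like io_tw_state after this change */

static pthread_mutex_t ring_lock = PTHREAD_MUTEX_INITIALIZER;

/* only callable with a tw_state token, i.e. from run_task_work() below */
static void complete_request(int res, struct tw_state *ts)
{
	(void)ts;
	/* stand-in for lockdep_assert_held(): trylock on a held mutex fails */
	assert(pthread_mutex_trylock(&ring_lock) == EBUSY);
	printf("completed res=%d under the ring lock\n", res);
}

static void run_task_work(void)
{
	struct tw_state ts = {};	/* the token only ever exists here */

	pthread_mutex_lock(&ring_lock);
	complete_request(0, &ts);
	pthread_mutex_unlock(&ring_lock);
}

int main(void)
{
	run_task_work();
	return 0;
}

Because the token can only be obtained inside the locked dispatcher, passing it is a compile-time reminder of the locking context rather than a runtime flag, which is exactly why the struct can stay around even though it is now empty.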
Signed-off-by: Pavel Begunkov Tested-by: Ming Lei Link: https://lore.kernel.org/r/e95e1ea116d0bfa54b656076e6a977bc221392a4.1710799188.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 -- io_uring/io_uring.c | 31 ++++++++----------------------- io_uring/io_uring.h | 5 +---- io_uring/poll.c | 2 +- io_uring/rw.c | 6 ++---- io_uring/timeout.c | 8 ++------ io_uring/uring_cmd.c | 8 ++------ io_uring/waitid.c | 2 +- 8 files changed, 17 insertions(+), 47 deletions(-) (limited to 'io_uring/rw.c') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index ac333ea81d31..90e5af401800 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -438,8 +438,6 @@ struct io_ring_ctx { }; struct io_tw_state { - /* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */ - bool locked; }; enum { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 13fb5958454f..ecb59bb2418b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -247,14 +247,12 @@ static __cold void io_fallback_req_func(struct work_struct *work) fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; - struct io_tw_state ts = { .locked = true, }; + struct io_tw_state ts = {}; percpu_ref_get(&ctx->refs); mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) req->io_task_work.func(req, &ts); - if (WARN_ON_ONCE(!ts.locked)) - return; io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); @@ -1157,11 +1155,9 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (ts->locked) { - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - ts->locked = false; - } + + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); } @@ -1185,8 +1181,6 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, if (req->ctx != ctx) { ctx_flush_and_put(ctx, &ts); ctx = req->ctx; - - ts.locked = true; mutex_lock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); } @@ -1459,22 +1453,16 @@ again: static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, int min_events) { - struct io_tw_state ts = { .locked = true, }; - int ret; + struct io_tw_state ts = {}; if (llist_empty(&ctx->work_llist)) return 0; - - ret = __io_run_local_work(ctx, &ts, min_events); - /* shouldn't happen! 
*/ - if (WARN_ON_ONCE(!ts.locked)) - mutex_lock(&ctx->uring_lock); - return ret; + return __io_run_local_work(ctx, &ts, min_events); } static int io_run_local_work(struct io_ring_ctx *ctx, int min_events) { - struct io_tw_state ts = { .locked = true }; + struct io_tw_state ts = {}; int ret; mutex_lock(&ctx->uring_lock); @@ -1702,10 +1690,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) { - if (ts->locked) - io_req_complete_defer(req); - else - io_req_complete_post(req, IO_URING_F_UNLOCKED); + io_req_complete_defer(req); } /* diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 0861d49e83de..7f921daae9c3 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -351,10 +351,7 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx) static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) { - if (!ts->locked) { - mutex_lock(&ctx->uring_lock); - ts->locked = true; - } + lockdep_assert_held(&ctx->uring_lock); } /* diff --git a/io_uring/poll.c b/io_uring/poll.c index 6db1dcb2c797..8901dd118e50 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -322,7 +322,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_fill_cqe_req_aux(req, ts->locked, mask, + if (!io_fill_cqe_req_aux(req, true, mask, IORING_CQE_F_MORE)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; diff --git a/io_uring/rw.c b/io_uring/rw.c index 0afe4f9e0e3f..674581bda8d7 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -305,11 +305,9 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) io_req_io_end(req); - if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) + req->cqe.flags |= io_put_kbuf(req, 0); - req->cqe.flags |= io_put_kbuf(req, issue_flags); - } io_req_task_complete(req, ts); } diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 7fd7dbb211d6..0a48e6acd0b2 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -72,10 +72,7 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) struct io_ring_ctx *ctx = req->ctx; if (!io_timeout_finish(timeout, data)) { - bool filled; - filled = io_fill_cqe_req_aux(req, ts->locked, -ETIME, - IORING_CQE_F_MORE); - if (filled) { + if (io_fill_cqe_req_aux(req, true, -ETIME, IORING_CQE_F_MORE)) { /* re-arm timer */ spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); @@ -301,7 +298,6 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) { - unsigned issue_flags = ts->locked ? 
0 : IO_URING_F_UNLOCKED; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; int ret = -ENOENT; @@ -313,7 +309,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *t .data = prev->cqe.user_data, }; - ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); + ret = io_try_cancel(req->task->io_uring, &cd, 0); } io_req_set_res(req, ret ?: -ETIME, 0); io_req_task_complete(req, ts); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 759f919b14a9..4614ce734fee 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -87,13 +87,9 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - unsigned issue_flags = IO_URING_F_UNLOCKED; - /* locked task_work executor checks the deffered list completion */ - if (ts->locked) - issue_flags = IO_URING_F_COMPLETE_DEFER; - - ioucmd->task_work_cb(ioucmd, issue_flags); + /* task_work executor checks the deffered list completion */ + ioucmd->task_work_cb(ioucmd, IO_URING_F_COMPLETE_DEFER); } void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 77d340666cb9..6362ec20abc0 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -118,7 +118,7 @@ static int io_waitid_finish(struct io_kiocb *req, int ret) static void io_waitid_complete(struct io_kiocb *req, int ret) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); - struct io_tw_state ts = { .locked = true }; + struct io_tw_state ts = {}; /* anyone completing better be holding a reference */ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); -- cgit v1.2.3-70-g09d2 From e5c12945be5016d681ff305ea7306fef5902219d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Mar 2024 22:00:31 +0000 Subject: io_uring: refactor io_fill_cqe_req_aux The restriction on multishot execution context disallowing io-wq is driven by rules of io_fill_cqe_req_aux(), it should only be called in the master task context, either from the syscall path or in task_work. Since task_work now always takes the ctx lock implying IO_URING_F_COMPLETE_DEFER, we can just assume that the function is always called with its defer argument set to true. Kill the argument. Also rename the function for more consistency as "fill" in CQE related functions was usually meant for raw interfaces only copying data into the CQ without any locking, waking the user and other accounting "post" functions take care of. 
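As a rough standalone illustration of the pattern the renamed helper serves (the queue size, flag value and all names here are assumptions, not the kernel implementation): a multishot handler posts one completion per event with a MORE flag and terminates the multishot once a completion can no longer be posted.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CQE_F_MORE	(1u << 1)	/* "expect further completions" */
#define CQ_ENTRIES	4

struct cqe { int32_t res; uint32_t flags; };

static struct cqe cq[CQ_ENTRIES];
static unsigned cq_tail;

/* analogous role to io_req_post_cqe(): false means the CQE could not be posted */
static bool post_cqe(int32_t res, uint32_t flags)
{
	if (cq_tail == CQ_ENTRIES)
		return false;
	cq[cq_tail++] = (struct cqe){ .res = res, .flags = flags };
	return true;
}

int main(void)
{
	/* a multishot request seeing 10 events: post each with MORE set,
	 * stop the multishot as soon as posting fails */
	for (int ev = 0; ev < 10; ev++) {
		if (!post_cqe(32 /* bytes received */, CQE_F_MORE)) {
			printf("multishot terminated at event %d\n", ev);
			break;
		}
	}
	for (unsigned i = 0; i < cq_tail; i++)
		printf("cqe[%u]: res=%d flags=%#x\n", i, (int)cq[i].res,
		       (unsigned)cq[i].flags);
	return 0;
}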
Signed-off-by: Pavel Begunkov Tested-by: Ming Lei Link: https://lore.kernel.org/r/93423d106c33116c7d06bf277f651aa68b427328.1710799188.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 +++------------- io_uring/io_uring.h | 2 +- io_uring/net.c | 6 ++---- io_uring/poll.c | 3 +-- io_uring/rw.c | 4 +--- io_uring/timeout.c | 2 +- 6 files changed, 9 insertions(+), 24 deletions(-) (limited to 'io_uring/rw.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ecb59bb2418b..9ae5e7db7c6a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -907,40 +907,30 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx) state->cqes_count = 0; } -static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - if (!filled && allow_overflow) + if (!filled) filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); io_cq_unlock_post(ctx); return filled; } -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) -{ - return __io_post_aux_cqe(ctx, user_data, res, cflags, true); -} - /* * A helper for multishot requests posting additional CQEs. * Should only be used from a task_work including IO_URING_F_MULTISHOT. */ -bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags) +bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) { struct io_ring_ctx *ctx = req->ctx; u64 user_data = req->cqe.user_data; struct io_uring_cqe *cqe; lockdep_assert(!io_wq_current_is_worker()); - - if (!defer) - return __io_post_aux_cqe(ctx, user_data, res, cflags, false); - lockdep_assert_held(&ctx->uring_lock); if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 7f921daae9c3..460290e1bdec 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -67,7 +67,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags); +bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); diff --git a/io_uring/net.c b/io_uring/net.c index 4afb475d4197..48bf5ca1a671 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -706,8 +706,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, * receive from this socket. 
*/ if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && - io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, - *ret, cflags | IORING_CQE_F_MORE)) { + io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; @@ -1429,8 +1428,7 @@ retry: if (ret < 0) return ret; - if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, - ret, IORING_CQE_F_MORE)) + if (io_req_post_cqe(req, ret, IORING_CQE_F_MORE)) goto retry; io_req_set_res(req, ret, 0); diff --git a/io_uring/poll.c b/io_uring/poll.c index 8901dd118e50..5d55bbf1de15 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -322,8 +322,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_fill_cqe_req_aux(req, true, mask, - IORING_CQE_F_MORE)) { + if (!io_req_post_cqe(req, mask, IORING_CQE_F_MORE)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } diff --git a/io_uring/rw.c b/io_uring/rw.c index 674581bda8d7..b5165d1c9686 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -962,9 +962,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) cflags = io_put_kbuf(req, issue_flags); rw->len = 0; /* similarly to above, reset len to 0 */ - if (io_fill_cqe_req_aux(req, - issue_flags & IO_URING_F_COMPLETE_DEFER, - ret, cflags | IORING_CQE_F_MORE)) { + if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { if (issue_flags & IO_URING_F_MULTISHOT) { /* * Force retry, as we might have more data to diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 0a48e6acd0b2..3458ca550b83 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -72,7 +72,7 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) struct io_ring_ctx *ctx = req->ctx; if (!io_timeout_finish(timeout, data)) { - if (io_fill_cqe_req_aux(req, true, -ETIME, IORING_CQE_F_MORE)) { + if (io_req_post_cqe(req, -ETIME, IORING_CQE_F_MORE)) { /* re-arm timer */ spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); -- cgit v1.2.3-70-g09d2 From a9165b83c1937eeed1f0c731468216d6371d647f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2024 16:13:01 -0600 Subject: io_uring/rw: always setup io_async_rw for read/write requests read/write requests try to put everything on the stack, and then alloc and copy if a retry is needed. This necessitates a bunch of nasty code that deals with intermediate state. Get rid of this, and have the prep side setup everything that is needed upfront, which greatly simplifies the opcode handlers. This includes adding an alloc cache for io_async_rw, to make it cheap to handle. In terms of cost, this should be basically free and transparent. For the worst case of {READ,WRITE}_FIXED which didn't need it before, performance is unaffected in the normal peak workload that is being used to test that. Still runs at 122M IOPS. 
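A stripped-down standalone sketch of the shape the code moves toward (hypothetical names, not kernel code): prep allocates the per-request async state and captures the buffer exactly once, so the issue path only consumes that state, and a short transfer can simply be reissued without re-importing or copying anything.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct async_rw {
	const char	*buf;	/* imported once at prep time */
	size_t		len;
	size_t		pos;	/* iterator position, persists across retries */
};

/* prep: everything needed later is set up here, unconditionally */
static struct async_rw *prep_read(const char *ubuf, size_t len)
{
	struct async_rw *io = calloc(1, sizeof(*io));

	if (io) {
		io->buf = ubuf;
		io->len = len;
	}
	return io;
}

/* issue: copy up to 'chunk' bytes from the prepared state; 0 means done */
static size_t issue_read(struct async_rw *io, char *dst, size_t chunk)
{
	size_t n = io->len - io->pos;

	if (n > chunk)
		n = chunk;
	memcpy(dst, io->buf + io->pos, n);
	io->pos += n;
	return n;
}

int main(void)
{
	char out[64];
	struct async_rw *io = prep_read("hello io_uring", 14);
	size_t off = 0, n;

	if (!io)
		return 1;
	/* short transfers just reissue from the same prepared state */
	while ((n = issue_read(io, out + off, 5)) > 0)
		off += n;
	printf("%.*s\n", (int)off, out);
	free(io);
	return 0;
}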
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 3 + io_uring/opdef.c | 15 +- io_uring/rw.c | 538 +++++++++++++++++++---------------------- io_uring/rw.h | 19 +- 5 files changed, 278 insertions(+), 298 deletions(-) (limited to 'io_uring/rw.c') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 75b46119d4c8..60d7e35fc303 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -300,6 +300,7 @@ struct io_ring_ctx { struct io_hash_table cancel_table_locked; struct io_alloc_cache apoll_cache; struct io_alloc_cache netmsg_cache; + struct io_alloc_cache rw_cache; /* * Any cancelable uring_cmd is added to this list in diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e6e7794d497f..b0554d122931 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -311,6 +311,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) sizeof(struct async_poll)); io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_msghdr)); + io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_rw)); io_futex_cache_init(ctx); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); @@ -2823,6 +2825,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_eventfd_unregister(ctx); io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_futex_cache_free(ctx); io_destroy_buffers(ctx); mutex_unlock(&ctx->uring_lock); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index dd4a1e1425e1..fcae75a08f2c 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -67,7 +67,7 @@ const struct io_issue_def io_issue_defs[] = { .iopoll = 1, .iopoll_queue = 1, .vectored = 1, - .prep = io_prep_rwv, + .prep = io_prep_readv, .issue = io_read, }, [IORING_OP_WRITEV] = { @@ -81,7 +81,7 @@ const struct io_issue_def io_issue_defs[] = { .iopoll = 1, .iopoll_queue = 1, .vectored = 1, - .prep = io_prep_rwv, + .prep = io_prep_writev, .issue = io_write, }, [IORING_OP_FSYNC] = { @@ -99,7 +99,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw_fixed, + .prep = io_prep_read_fixed, .issue = io_read, }, [IORING_OP_WRITE_FIXED] = { @@ -112,7 +112,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw_fixed, + .prep = io_prep_write_fixed, .issue = io_write, }, [IORING_OP_POLL_ADD] = { @@ -239,7 +239,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw, + .prep = io_prep_read, .issue = io_read, }, [IORING_OP_WRITE] = { @@ -252,7 +252,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw, + .prep = io_prep_write, .issue = io_write, }, [IORING_OP_FADVISE] = { @@ -490,14 +490,12 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_READV] = { .async_size = sizeof(struct io_async_rw), .name = "READV", - .prep_async = io_readv_prep_async, .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITEV] = { .async_size = sizeof(struct io_async_rw), .name = "WRITEV", - .prep_async = io_writev_prep_async, .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, @@ -699,6 +697,7 @@ const struct io_cold_def io_cold_defs[] = { #endif }, 
[IORING_OP_READ_MULTISHOT] = { + .async_size = sizeof(struct io_async_rw), .name = "READ_MULTISHOT", }, [IORING_OP_WAITID] = { diff --git a/io_uring/rw.c b/io_uring/rw.c index b5165d1c9686..2625e177c9bc 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -75,7 +75,153 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req) return 0; } -int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int __io_import_iovec(int ddir, struct io_kiocb *req, + struct io_async_rw *io, + unsigned int issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + void __user *buf; + size_t sqe_len; + + buf = u64_to_user_ptr(rw->addr); + sqe_len = rw->len; + + if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) { + if (io_do_buffer_select(req)) { + buf = io_buffer_select(req, &sqe_len, issue_flags); + if (!buf) + return -ENOBUFS; + rw->addr = (unsigned long) buf; + rw->len = sqe_len; + } + + return import_ubuf(ddir, buf, sqe_len, &io->s.iter); + } + + io->free_iovec = io->s.fast_iov; + return __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &io->free_iovec, + &io->s.iter, req->ctx->compat); +} + +static inline int io_import_iovec(int rw, struct io_kiocb *req, + struct io_async_rw *io, + unsigned int issue_flags) +{ + int ret; + + ret = __io_import_iovec(rw, req, io, issue_flags); + if (unlikely(ret < 0)) + return ret; + + iov_iter_save_state(&io->s.iter, &io->s.iter_state); + return 0; +} + +static void io_rw_iovec_free(struct io_async_rw *rw) +{ + if (rw->free_iovec) { + kfree(rw->free_iovec); + rw->free_iovec = NULL; + } +} + +static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_async_rw *rw = req->async_data; + + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { + io_rw_iovec_free(rw); + return; + } + if (io_alloc_cache_put(&req->ctx->rw_cache, &rw->cache)) { + req->async_data = NULL; + req->flags &= ~REQ_F_ASYNC_DATA; + } +} + +static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) +{ + /* + * Disable quick recycling for anything that's gone through io-wq. + * In theory, this should be fine to cleanup. However, some read or + * write iter handling touches the iovec AFTER having called into the + * handler, eg to reexpand or revert. This means we can have: + * + * task io-wq + * issue + * punt to io-wq + * issue + * blkdev_write_iter() + * ->ki_complete() + * io_complete_rw() + * queue tw complete + * run tw + * req_rw_cleanup + * iov_iter_count() <- look at iov_iter again + * + * which can lead to a UAF. This is only possible for io-wq offload + * as the cleanup can run in parallel. As io-wq is not the fast path, + * just leave cleanup to the end. + * + * This is really a bug in the core code that does this, any issue + * path should assume that a successful (or -EIOCBQUEUED) return can + * mean that the underlying data can be gone at any time. But that + * should be fixed seperately, and then this check could be killed. 
+ */ + if (!(req->flags & REQ_F_REFCOUNT)) { + req->flags &= ~REQ_F_NEED_CLEANUP; + io_rw_recycle(req, issue_flags); + } +} + +static int io_rw_alloc_async(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_cache_entry *entry; + struct io_async_rw *rw; + + entry = io_alloc_cache_get(&ctx->rw_cache); + if (entry) { + rw = container_of(entry, struct io_async_rw, cache); + req->flags |= REQ_F_ASYNC_DATA; + req->async_data = rw; + goto done; + } + + if (!io_alloc_async_data(req)) { + rw = req->async_data; +done: + rw->free_iovec = NULL; + rw->bytes_done = 0; + return 0; + } + + return -ENOMEM; +} + +static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) +{ + struct io_async_rw *rw; + int ret; + + if (io_rw_alloc_async(req)) + return -ENOMEM; + + if (!do_import || io_do_buffer_select(req)) + return 0; + + rw = req->async_data; + ret = io_import_iovec(ddir, req, rw, 0); + if (unlikely(ret < 0)) + return ret; + + iov_iter_save_state(&rw->s.iter, &rw->s.iter_state); + return 0; +} + +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir, bool do_import) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned ioprio; @@ -100,34 +246,58 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); - return 0; + return io_prep_rw_setup(req, ddir, do_import); } -int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe) +int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + return io_prep_rw(req, sqe, ITER_DEST, true); +} + +int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rw(req, sqe, ITER_SOURCE, true); +} + +static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) +{ + const bool do_import = !(req->flags & REQ_F_BUFFER_SELECT); int ret; - ret = io_prep_rw(req, sqe); + ret = io_prep_rw(req, sqe, ddir, do_import); if (unlikely(ret)) return ret; + if (do_import) + return 0; /* * Have to do this validation here, as this is in io_read() rw->len * might have chanaged due to buffer selection */ - if (req->flags & REQ_F_BUFFER_SELECT) - return io_iov_buffer_select_prep(req); + return io_iov_buffer_select_prep(req); +} - return 0; +int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rwv(req, sqe, ITER_DEST); +} + +int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rwv(req, sqe, ITER_SOURCE); } -int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct io_ring_ctx *ctx = req->ctx; + struct io_async_rw *io; u16 index; int ret; - ret = io_prep_rw(req, sqe); + ret = io_prep_rw(req, sqe, ddir, false); if (unlikely(ret)) return ret; @@ -136,7 +306,21 @@ int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); req->imu = ctx->user_bufs[index]; io_req_set_rsrc_node(req, ctx, 0); - return 0; + + io = req->async_data; + ret = io_import_fixed(ddir, &io->s.iter, req->imu, rw->addr, rw->len); + iov_iter_save_state(&io->s.iter, &io->s.iter_state); + return ret; +} + +int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rw_fixed(req, sqe, 
ITER_DEST); +} + +int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rw_fixed(req, sqe, ITER_SOURCE); } /* @@ -152,7 +336,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - ret = io_prep_rw(req, sqe); + ret = io_prep_rw(req, sqe, ITER_DEST, false); if (unlikely(ret)) return ret; @@ -165,9 +349,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) void io_readv_writev_cleanup(struct io_kiocb *req) { - struct io_async_rw *io = req->async_data; - - kfree(io->free_iovec); + io_rw_iovec_free(req->async_data); } static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) @@ -188,14 +370,11 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) } #ifdef CONFIG_BLOCK -static bool io_resubmit_prep(struct io_kiocb *req) +static void io_resubmit_prep(struct io_kiocb *req) { struct io_async_rw *io = req->async_data; - if (!req_has_async_data(req)) - return !io_req_prep_async(req); iov_iter_restore(&io->s.iter, &io->s.iter_state); - return true; } static bool io_rw_should_reissue(struct io_kiocb *req) @@ -224,9 +403,8 @@ static bool io_rw_should_reissue(struct io_kiocb *req) return true; } #else -static bool io_resubmit_prep(struct io_kiocb *req) +static void io_resubmit_prep(struct io_kiocb *req) { - return false; } static bool io_rw_should_reissue(struct io_kiocb *req) { @@ -308,6 +486,7 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) req->cqe.flags |= io_put_kbuf(req, 0); + io_req_rw_cleanup(req, 0); io_req_task_complete(req, ts); } @@ -388,6 +567,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, io_req_io_end(req); io_req_set_res(req, final_ret, io_put_kbuf(req, issue_flags)); + io_req_rw_cleanup(req, issue_flags); return IOU_OK; } } else { @@ -396,71 +576,12 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; - if (io_resubmit_prep(req)) - return -EAGAIN; - else - io_req_task_queue_fail(req, final_ret); + io_resubmit_prep(req); + return -EAGAIN; } return IOU_ISSUE_SKIP_COMPLETE; } -static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, - struct io_rw_state *s, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct iov_iter *iter = &s->iter; - u8 opcode = req->opcode; - struct iovec *iovec; - void __user *buf; - size_t sqe_len; - ssize_t ret; - - if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - buf = u64_to_user_ptr(rw->addr); - sqe_len = rw->len; - - if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) { - if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); - if (!buf) - return ERR_PTR(-ENOBUFS); - rw->addr = (unsigned long) buf; - rw->len = sqe_len; - } - - ret = import_ubuf(ddir, buf, sqe_len, iter); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - iovec = s->fast_iov; - ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter, - req->ctx->compat); - if (unlikely(ret < 0)) - return ERR_PTR(ret); - return iovec; -} - -static inline int io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct io_rw_state *s, - unsigned int issue_flags) -{ - *iovec = 
__io_import_iovec(rw, req, s, issue_flags); - if (IS_ERR(*iovec)) - return PTR_ERR(*iovec); - - iov_iter_save_state(&s->iter, &s->iter_state); - return 0; -} - static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) { return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; @@ -532,89 +653,6 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) return ret; } -static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, - const struct iovec *fast_iov, struct iov_iter *iter) -{ - struct io_async_rw *io = req->async_data; - - memcpy(&io->s.iter, iter, sizeof(*iter)); - io->free_iovec = iovec; - io->bytes_done = 0; - /* can only be fixed buffers, no need to do anything */ - if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter)) - return; - if (!iovec) { - unsigned iov_off = 0; - - io->s.iter.__iov = io->s.fast_iov; - if (iter->__iov != fast_iov) { - iov_off = iter_iov(iter) - fast_iov; - io->s.iter.__iov += iov_off; - } - if (io->s.fast_iov != fast_iov) - memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, - sizeof(struct iovec) * iter->nr_segs); - } else { - req->flags |= REQ_F_NEED_CLEANUP; - } -} - -static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, - struct io_rw_state *s, bool force) -{ - if (!force && !io_cold_defs[req->opcode].prep_async) - return 0; - /* opcode type doesn't need async data */ - if (!io_cold_defs[req->opcode].async_size) - return 0; - if (!req_has_async_data(req)) { - struct io_async_rw *iorw; - - if (io_alloc_async_data(req)) { - kfree(iovec); - return -ENOMEM; - } - - io_req_map_rw(req, iovec, s->fast_iov, &s->iter); - iorw = req->async_data; - /* we've copied and mapped the iter, ensure state is saved */ - iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); - } - return 0; -} - -static inline int io_rw_prep_async(struct io_kiocb *req, int rw) -{ - struct io_async_rw *iorw = req->async_data; - struct iovec *iov; - int ret; - - iorw->bytes_done = 0; - iorw->free_iovec = NULL; - - /* submission path, ->uring_lock should already be taken */ - ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); - if (unlikely(ret < 0)) - return ret; - - if (iov) { - iorw->free_iovec = iov; - req->flags |= REQ_F_NEED_CLEANUP; - } - - return 0; -} - -int io_readv_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, ITER_DEST); -} - -int io_writev_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, ITER_SOURCE); -} - /* * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. 
@@ -754,54 +792,28 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) static int __io_read(struct io_kiocb *req, unsigned int issue_flags) { + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; + struct io_async_rw *io = req->async_data; struct kiocb *kiocb = &rw->kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct io_async_rw *io; - ssize_t ret, ret2; + ssize_t ret; loff_t *ppos; - if (!req_has_async_data(req)) { - ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags); + if (io_do_buffer_select(req)) { + ret = io_import_iovec(ITER_DEST, req, io, issue_flags); if (unlikely(ret < 0)) return ret; - } else { - io = req->async_data; - s = &io->s; - - /* - * Safe and required to re-import if we're using provided - * buffers, as we dropped the selected one before retry. - */ - if (io_do_buffer_select(req)) { - ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } - - /* - * We come here from an earlier attempt, restore our state to - * match in case it doesn't. It's cheap enough that we don't - * need to make this conditional. - */ - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; } + ret = io_rw_init_file(req, FMODE_READ); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } - req->cqe.res = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&io->s.iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) { - ret = io_setup_async_rw(req, iovec, s, true); - return ret ?: -EAGAIN; - } + if (unlikely(!io_file_supports_nowait(req))) + return -EAGAIN; kiocb->ki_flags |= IOCB_NOWAIT; } else { /* Ensure we clear previously set non-block flag */ @@ -811,20 +823,15 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } - ret = io_iter_do_read(rw, &s->iter); + ret = io_iter_do_read(rw, &io->s.iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; - /* - * If we can poll, just do that. For a vectored read, we'll - * need to copy state first. - */ - if (io_file_can_poll(req) && !io_issue_defs[req->opcode].vectored) + /* If we can poll, just do that. */ + if (io_file_can_poll(req)) return -EAGAIN; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -834,8 +841,6 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) goto done; ret = 0; } else if (ret == -EIOCBQUEUED) { - if (iovec) - kfree(iovec); return IOU_ISSUE_SKIP_COMPLETE; } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) { @@ -848,21 +853,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * untouched in case of error. Restore it and we'll advance it * manually if we need to. */ - iov_iter_restore(&s->iter, &s->iter_state); - - ret2 = io_setup_async_rw(req, iovec, s, true); - iovec = NULL; - if (ret2) { - ret = ret > 0 ? ret : ret2; - goto done; - } - - io = req->async_data; - s = &io->s; - /* - * Now use our persistent iterator and state, if we aren't already. - * We've restored and mapped the iter to match. 
- */ + iov_iter_restore(&io->s.iter, &io->s.iter_state); do { /* @@ -870,11 +861,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * above or inside this loop. Advance the iter by the bytes * that were consumed. */ - iov_iter_advance(&s->iter, ret); - if (!iov_iter_count(&s->iter)) + iov_iter_advance(&io->s.iter, ret); + if (!iov_iter_count(&io->s.iter)) break; io->bytes_done += ret; - iov_iter_save_state(&s->iter, &s->iter_state); + iov_iter_save_state(&io->s.iter, &io->s.iter_state); /* if we can retry, do so with the callbacks armed */ if (!io_rw_should_retry(req)) { @@ -882,24 +873,22 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } - req->cqe.res = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&io->s.iter); /* * Now retry read with the IOCB_WAITQ parts set in the iocb. If * we get -EIOCBQUEUED, then we'll get a notification when the * desired page gets unlocked. We can also get a partial read * here, and if we do, then just retry at the new offset. */ - ret = io_iter_do_read(rw, &s->iter); + ret = io_iter_do_read(rw, &io->s.iter); if (ret == -EIOCBQUEUED) return IOU_ISSUE_SKIP_COMPLETE; /* we got some bytes, but not all. retry. */ kiocb->ki_flags &= ~IOCB_WAITQ; - iov_iter_restore(&s->iter, &s->iter_state); + iov_iter_restore(&io->s.iter, &io->s.iter_state); } while (ret > 0); done: /* it's faster to check here then delegate to kfree */ - if (iovec) - kfree(iovec); return ret; } @@ -908,8 +897,9 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) int ret; ret = __io_read(req, issue_flags); - if (ret >= 0) - return kiocb_done(req, ret, issue_flags); + if (ret >= 0) { + ret = kiocb_done(req, ret, issue_flags); + } return ret; } @@ -981,6 +971,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) * multishot request, hitting overflow will terminate it. */ io_req_set_res(req, ret, cflags); + io_req_rw_cleanup(req, issue_flags); if (issue_flags & IO_URING_F_MULTISHOT) return IOU_STOP_MULTISHOT; return IOU_OK; @@ -988,42 +979,28 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) int io_write(struct io_kiocb *req, unsigned int issue_flags) { + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; + struct io_async_rw *io = req->async_data; struct kiocb *kiocb = &rw->kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; ssize_t ret, ret2; loff_t *ppos; - if (!req_has_async_data(req)) { - ret = io_import_iovec(ITER_SOURCE, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } else { - struct io_async_rw *io = req->async_data; - - s = &io->s; - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; - } ret = io_rw_init_file(req, FMODE_WRITE); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } - req->cqe.res = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&io->s.iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ if (unlikely(!io_file_supports_nowait(req))) - goto copy_iov; + goto ret_eagain; /* File path supports NOWAIT for non-direct_IO only for block devices. 
*/ if (!(kiocb->ki_flags & IOCB_DIRECT) && !(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) && (req->flags & REQ_F_ISREG)) - goto copy_iov; + goto ret_eagain; kiocb->ki_flags |= IOCB_NOWAIT; } else { @@ -1034,19 +1011,17 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } if (req->flags & REQ_F_ISREG) kiocb_start_write(kiocb); kiocb->ki_flags |= IOCB_WRITE; if (likely(req->file->f_op->write_iter)) - ret2 = call_write_iter(req->file, kiocb, &s->iter); + ret2 = call_write_iter(req->file, kiocb, &io->s.iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, rw, &s->iter); + ret2 = loop_rw_iter(WRITE, rw, &io->s.iter); else ret2 = -EINVAL; @@ -1067,11 +1042,9 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) - goto copy_iov; + goto ret_eagain; if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { - struct io_async_rw *io; - trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2, req->cqe.res, ret2); @@ -1080,33 +1053,22 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) * in the worker. Also update bytes_done to account for * the bytes already written. */ - iov_iter_save_state(&s->iter, &s->iter_state); - ret = io_setup_async_rw(req, iovec, s, true); - - io = req->async_data; - if (io) - io->bytes_done += ret2; + iov_iter_save_state(&io->s.iter, &io->s.iter_state); + io->bytes_done += ret2; if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); - return ret ? ret : -EAGAIN; + return -EAGAIN; } done: ret = kiocb_done(req, ret2, issue_flags); } else { -copy_iov: - iov_iter_restore(&s->iter, &s->iter_state); - ret = io_setup_async_rw(req, iovec, s, false); - if (!ret) { - if (kiocb->ki_flags & IOCB_WRITE) - io_req_end_write(req); - return -EAGAIN; - } - return ret; +ret_eagain: + iov_iter_restore(&io->s.iter, &io->s.iter_state); + if (kiocb->ki_flags & IOCB_WRITE) + io_req_end_write(req); + return -EAGAIN; } - /* it's reportedly faster than delegating the null check to kfree() */ - if (iovec) - kfree(iovec); return ret; } @@ -1181,6 +1143,8 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) break; nr_events++; req->cqe.flags = io_put_kbuf(req, 0); + if (req->opcode != IORING_OP_URING_CMD) + io_req_rw_cleanup(req, 0); } if (unlikely(!nr_events)) return 0; @@ -1194,3 +1158,11 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) __io_submit_flush_completions(ctx); return nr_events; } + +void io_rw_cache_free(struct io_cache_entry *entry) +{ + struct io_async_rw *rw; + + rw = container_of(entry, struct io_async_rw, cache); + kfree(rw); +} diff --git a/io_uring/rw.h b/io_uring/rw.h index f9e89b4fe4da..f7905070d10b 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -9,21 +9,26 @@ struct io_rw_state { }; struct io_async_rw { + union { + size_t bytes_done; + struct io_cache_entry cache; + }; struct io_rw_state s; - const struct iovec *free_iovec; - size_t bytes_done; + struct iovec *free_iovec; struct wait_page_queue wpq; }; -int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_read_fixed(struct io_kiocb *req, const 
struct io_uring_sqe *sqe); +int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read(struct io_kiocb *req, unsigned int issue_flags); -int io_readv_prep_async(struct io_kiocb *req); int io_write(struct io_kiocb *req, unsigned int issue_flags); -int io_writev_prep_async(struct io_kiocb *req); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); +void io_rw_cache_free(struct io_cache_entry *entry); -- cgit v1.2.3-70-g09d2 From 0d10bd77a1be0742a12e1bcf0554a4bcbdbc0f35 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2024 16:25:58 -0600 Subject: io_uring: get rid of struct io_rw_state A separate state struct is not needed anymore, just fold it in with io_async_rw. Signed-off-by: Jens Axboe --- io_uring/rw.c | 45 +++++++++++++++++++++++---------------------- io_uring/rw.h | 10 +++------- 2 files changed, 26 insertions(+), 29 deletions(-) (limited to 'io_uring/rw.c') diff --git a/io_uring/rw.c b/io_uring/rw.c index 2625e177c9bc..9743df5617fd 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -96,12 +96,12 @@ static int __io_import_iovec(int ddir, struct io_kiocb *req, rw->len = sqe_len; } - return import_ubuf(ddir, buf, sqe_len, &io->s.iter); + return import_ubuf(ddir, buf, sqe_len, &io->iter); } - io->free_iovec = io->s.fast_iov; + io->free_iovec = io->fast_iov; return __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &io->free_iovec, - &io->s.iter, req->ctx->compat); + &io->iter, req->ctx->compat); } static inline int io_import_iovec(int rw, struct io_kiocb *req, @@ -114,7 +114,7 @@ static inline int io_import_iovec(int rw, struct io_kiocb *req, if (unlikely(ret < 0)) return ret; - iov_iter_save_state(&io->s.iter, &io->s.iter_state); + iov_iter_save_state(&io->iter, &io->iter_state); return 0; } @@ -216,7 +216,7 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) if (unlikely(ret < 0)) return ret; - iov_iter_save_state(&rw->s.iter, &rw->s.iter_state); + iov_iter_save_state(&rw->iter, &rw->iter_state); return 0; } @@ -308,8 +308,8 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe io_req_set_rsrc_node(req, ctx, 0); io = req->async_data; - ret = io_import_fixed(ddir, &io->s.iter, req->imu, rw->addr, rw->len); - iov_iter_save_state(&io->s.iter, &io->s.iter_state); + ret = io_import_fixed(ddir, &io->iter, req->imu, rw->addr, rw->len); + iov_iter_save_state(&io->iter, &io->iter_state); return ret; } @@ -374,7 +374,7 @@ static void io_resubmit_prep(struct io_kiocb *req) { struct io_async_rw *io = req->async_data; - iov_iter_restore(&io->s.iter, &io->s.iter_state); + iov_iter_restore(&io->iter, &io->iter_state); } static bool io_rw_should_reissue(struct io_kiocb *req) @@ -808,7 +808,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_rw_init_file(req, FMODE_READ); if (unlikely(ret)) return ret; - req->cqe.res = iov_iter_count(&io->s.iter); + req->cqe.res = iov_iter_count(&io->iter); if (force_nonblock) { /* If the file doesn't 
support async, just async punt */ @@ -826,7 +826,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret)) return ret; - ret = io_iter_do_read(rw, &io->s.iter); + ret = io_iter_do_read(rw, &io->iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; @@ -853,7 +853,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * untouched in case of error. Restore it and we'll advance it * manually if we need to. */ - iov_iter_restore(&io->s.iter, &io->s.iter_state); + iov_iter_restore(&io->iter, &io->iter_state); do { /* @@ -861,11 +861,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * above or inside this loop. Advance the iter by the bytes * that were consumed. */ - iov_iter_advance(&io->s.iter, ret); - if (!iov_iter_count(&io->s.iter)) + iov_iter_advance(&io->iter, ret); + if (!iov_iter_count(&io->iter)) break; io->bytes_done += ret; - iov_iter_save_state(&io->s.iter, &io->s.iter_state); + iov_iter_save_state(&io->iter, &io->iter_state); /* if we can retry, do so with the callbacks armed */ if (!io_rw_should_retry(req)) { @@ -873,19 +873,19 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } - req->cqe.res = iov_iter_count(&io->s.iter); + req->cqe.res = iov_iter_count(&io->iter); /* * Now retry read with the IOCB_WAITQ parts set in the iocb. If * we get -EIOCBQUEUED, then we'll get a notification when the * desired page gets unlocked. We can also get a partial read * here, and if we do, then just retry at the new offset. */ - ret = io_iter_do_read(rw, &io->s.iter); + ret = io_iter_do_read(rw, &io->iter); if (ret == -EIOCBQUEUED) return IOU_ISSUE_SKIP_COMPLETE; /* we got some bytes, but not all. retry. */ kiocb->ki_flags &= ~IOCB_WAITQ; - iov_iter_restore(&io->s.iter, &io->s.iter_state); + iov_iter_restore(&io->iter, &io->iter_state); } while (ret > 0); done: /* it's faster to check here then delegate to kfree */ @@ -989,7 +989,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ret = io_rw_init_file(req, FMODE_WRITE); if (unlikely(ret)) return ret; - req->cqe.res = iov_iter_count(&io->s.iter); + req->cqe.res = iov_iter_count(&io->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ @@ -1019,9 +1019,9 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags |= IOCB_WRITE; if (likely(req->file->f_op->write_iter)) - ret2 = call_write_iter(req->file, kiocb, &io->s.iter); + ret2 = call_write_iter(req->file, kiocb, &io->iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, rw, &io->s.iter); + ret2 = loop_rw_iter(WRITE, rw, &io->iter); else ret2 = -EINVAL; @@ -1053,7 +1053,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) * in the worker. Also update bytes_done to account for * the bytes already written. 
*/ - iov_iter_save_state(&io->s.iter, &io->s.iter_state); + iov_iter_save_state(&io->iter, &io->iter_state); io->bytes_done += ret2; if (kiocb->ki_flags & IOCB_WRITE) @@ -1064,7 +1064,7 @@ done: ret = kiocb_done(req, ret2, issue_flags); } else { ret_eagain: - iov_iter_restore(&io->s.iter, &io->s.iter_state); + iov_iter_restore(&io->iter, &io->iter_state); if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); return -EAGAIN; @@ -1164,5 +1164,6 @@ void io_rw_cache_free(struct io_cache_entry *entry) struct io_async_rw *rw; rw = container_of(entry, struct io_async_rw, cache); + kfree(rw->free_iovec); kfree(rw); } diff --git a/io_uring/rw.h b/io_uring/rw.h index f7905070d10b..7824896dc52d 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -2,18 +2,14 @@ #include -struct io_rw_state { - struct iov_iter iter; - struct iov_iter_state iter_state; - struct iovec fast_iov[UIO_FASTIOV]; -}; - struct io_async_rw { union { size_t bytes_done; struct io_cache_entry cache; }; - struct io_rw_state s; + struct iov_iter iter; + struct iov_iter_state iter_state; + struct iovec fast_iov[UIO_FASTIOV]; struct iovec *free_iovec; struct wait_page_queue wpq; }; -- cgit v1.2.3-70-g09d2 From cca6571381a0bdc88021a1f7a4c2349df21279f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 Mar 2024 20:41:18 -0600 Subject: io_uring/rw: cleanup retry path We no longer need to gate a potential retry on whether or not the context matches our original task, as all read/write operations have been fully prepared upfront. This means there's never any re-import needed, and hence we can always retry requests. Signed-off-by: Jens Axboe --- io_uring/rw.c | 35 ++++++++--------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) (limited to 'io_uring/rw.c') diff --git a/io_uring/rw.c b/io_uring/rw.c index 9743df5617fd..ab4454aa8e30 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -369,16 +369,9 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) return NULL; } -#ifdef CONFIG_BLOCK -static void io_resubmit_prep(struct io_kiocb *req) -{ - struct io_async_rw *io = req->async_data; - - iov_iter_restore(&io->iter, &io->iter_state); -} - static bool io_rw_should_reissue(struct io_kiocb *req) { +#ifdef CONFIG_BLOCK umode_t mode = file_inode(req->file)->i_mode; struct io_ring_ctx *ctx = req->ctx; @@ -394,23 +387,11 @@ static bool io_rw_should_reissue(struct io_kiocb *req) */ if (percpu_ref_is_dying(&ctx->refs)) return false; - /* - * Play it safe and assume not safe to re-import and reissue if we're - * not in the original thread group (or in task context). 
- */ - if (!same_thread_group(req->task, current) || !in_task()) - return false; return true; -} #else -static void io_resubmit_prep(struct io_kiocb *req) -{ -} -static bool io_rw_should_reissue(struct io_kiocb *req) -{ return false; -} #endif +} static void io_req_end_write(struct io_kiocb *req) { @@ -575,8 +556,10 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, } if (req->flags & REQ_F_REISSUE) { + struct io_async_rw *io = req->async_data; + req->flags &= ~REQ_F_REISSUE; - io_resubmit_prep(req); + iov_iter_restore(&io->iter, &io->iter_state); return -EAGAIN; } return IOU_ISSUE_SKIP_COMPLETE; @@ -897,9 +880,8 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) int ret; ret = __io_read(req, issue_flags); - if (ret >= 0) { - ret = kiocb_done(req, ret, issue_flags); - } + if (ret >= 0) + return kiocb_done(req, ret, issue_flags); return ret; } @@ -1061,7 +1043,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } done: - ret = kiocb_done(req, ret2, issue_flags); + return kiocb_done(req, ret2, issue_flags); } else { ret_eagain: iov_iter_restore(&io->iter, &io->iter_state); @@ -1069,7 +1051,6 @@ ret_eagain: io_req_end_write(req); return -EAGAIN; } - return ret; } void io_rw_fail(struct io_kiocb *req) -- cgit v1.2.3-70-g09d2 From d6f911a6b22f8e48aec82cd5f6b5a14dc76a56c3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2024 16:31:44 -0600 Subject: io_uring/rw: add iovec recycling Let the io_async_rw hold on to the iovec and reuse it, rather than always allocate and free them. Also enables KASAN for the iovec entries, so that reuse can be detected even while they are in the cache. While doing so, shrink io_async_rw by getting rid of the bigger embedded fast iovec. Since iovecs are being recycled now, shrink it from 8 to 1. This reduces the io_async_rw size from 264 to 160 bytes, a 40% reduction. 
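The recycling idea reduces to keeping the last iovec allocation attached to the request's async data and only reallocating when a larger array is needed; a minimal standalone sketch follows (names are illustrative, and the real code additionally KASAN-poisons the cached array while it sits idle). This pairs with shrinking the embedded fast iovec from 8 entries to 1: once allocations are reused, the big inline array buys little.

#include <stdio.h>
#include <stdlib.h>

struct iovec_ish { void *base; size_t len; };

struct async_rw {
	struct iovec_ish	*free_iovec;	/* recycled allocation, may be NULL */
	int			free_iov_nr;	/* number of entries it can hold */
};

/* return an iovec array with at least nr_segs entries, reusing the cached one */
static struct iovec_ish *get_iovec(struct async_rw *io, int nr_segs)
{
	struct iovec_ish *iov;

	if (io->free_iovec && io->free_iov_nr >= nr_segs)
		return io->free_iovec;			/* reuse, no allocation */

	iov = realloc(io->free_iovec, nr_segs * sizeof(*iov));
	if (!iov)
		return NULL;
	io->free_iovec = iov;
	io->free_iov_nr = nr_segs;
	return iov;
}

int main(void)
{
	struct async_rw io = { 0 };

	for (int i = 0; i < 3; i++) {
		struct iovec_ish *iov = get_iovec(&io, 4);

		printf("request %d uses iovec at %p (%d segs)\n", i,
		       (void *)iov, io.free_iov_nr);
	}
	free(io.free_iovec);
	return 0;
}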
Signed-off-by: Jens Axboe --- io_uring/rw.c | 42 +++++++++++++++++++++++++++++++++++++----- io_uring/rw.h | 3 ++- 2 files changed, 39 insertions(+), 6 deletions(-) (limited to 'io_uring/rw.c') diff --git a/io_uring/rw.c b/io_uring/rw.c index ab4454aa8e30..e84d322a6150 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -81,7 +81,9 @@ static int __io_import_iovec(int ddir, struct io_kiocb *req, { const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct iovec *iov; void __user *buf; + int nr_segs, ret; size_t sqe_len; buf = u64_to_user_ptr(rw->addr); @@ -99,9 +101,24 @@ static int __io_import_iovec(int ddir, struct io_kiocb *req, return import_ubuf(ddir, buf, sqe_len, &io->iter); } - io->free_iovec = io->fast_iov; - return __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &io->free_iovec, - &io->iter, req->ctx->compat); + if (io->free_iovec) { + nr_segs = io->free_iov_nr; + iov = io->free_iovec; + } else { + iov = &io->fast_iov; + nr_segs = 1; + } + ret = __import_iovec(ddir, buf, sqe_len, nr_segs, &iov, &io->iter, + req->ctx->compat); + if (unlikely(ret < 0)) + return ret; + if (iov) { + req->flags |= REQ_F_NEED_CLEANUP; + io->free_iov_nr = io->iter.nr_segs; + kfree(io->free_iovec); + io->free_iovec = iov; + } + return 0; } static inline int io_import_iovec(int rw, struct io_kiocb *req, @@ -122,6 +139,7 @@ static void io_rw_iovec_free(struct io_async_rw *rw) { if (rw->free_iovec) { kfree(rw->free_iovec); + rw->free_iov_nr = 0; rw->free_iovec = NULL; } } @@ -129,12 +147,16 @@ static void io_rw_iovec_free(struct io_async_rw *rw) static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_rw *rw = req->async_data; + struct iovec *iov; if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { io_rw_iovec_free(rw); return; } + iov = rw->free_iovec; if (io_alloc_cache_put(&req->ctx->rw_cache, &rw->cache)) { + if (iov) + kasan_mempool_poison_object(iov); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; } @@ -184,6 +206,11 @@ static int io_rw_alloc_async(struct io_kiocb *req) entry = io_alloc_cache_get(&ctx->rw_cache); if (entry) { rw = container_of(entry, struct io_async_rw, cache); + if (rw->free_iovec) { + kasan_mempool_unpoison_object(rw->free_iovec, + rw->free_iov_nr * sizeof(struct iovec)); + req->flags |= REQ_F_NEED_CLEANUP; + } req->flags |= REQ_F_ASYNC_DATA; req->async_data = rw; goto done; @@ -191,8 +218,9 @@ static int io_rw_alloc_async(struct io_kiocb *req) if (!io_alloc_async_data(req)) { rw = req->async_data; -done: rw->free_iovec = NULL; + rw->free_iov_nr = 0; +done: rw->bytes_done = 0; return 0; } @@ -1145,6 +1173,10 @@ void io_rw_cache_free(struct io_cache_entry *entry) struct io_async_rw *rw; rw = container_of(entry, struct io_async_rw, cache); - kfree(rw->free_iovec); + if (rw->free_iovec) { + kasan_mempool_unpoison_object(rw->free_iovec, + rw->free_iov_nr * sizeof(struct iovec)); + io_rw_iovec_free(rw); + } kfree(rw); } diff --git a/io_uring/rw.h b/io_uring/rw.h index 7824896dc52d..cf51d0eb407a 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -9,8 +9,9 @@ struct io_async_rw { }; struct iov_iter iter; struct iov_iter_state iter_state; - struct iovec fast_iov[UIO_FASTIOV]; + struct iovec fast_iov; struct iovec *free_iovec; + int free_iov_nr; struct wait_page_queue wpq; }; -- cgit v1.2.3-70-g09d2 From 414d0f45c316221acbf066658afdbae5b354a5cc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Mar 2024 15:19:44 -0600 Subject: io_uring/alloc_cache: switch to array based caching 
Currently lists are being used to manage this, but best practice is usually to have these in an array instead as that is cheaper to manage. Outside of that detail, games are also played with KASAN as the list is inside the cached entry itself. Finally, all users of this need a struct io_cache_entry embedded in their struct, which is union'ized with something else in there that isn't used across the free -> realloc cycle. Get rid of all of that, and simply have it be an array. This will not change the memory used, as we're just trading an 8-byte member entry for the per-elem array size. This reduces the overhead of the recycled allocations, and it reduces the amount of code needed to support recycling to about half of what it currently is. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/alloc_cache.h | 57 +++++++++++++++++++----------------------- io_uring/futex.c | 30 +++++++++------------- io_uring/futex.h | 5 ++-- io_uring/io_uring.c | 34 ++++++++++++++----------- io_uring/net.c | 13 ++++------ io_uring/net.h | 18 ++++--------- io_uring/poll.c | 12 +++------ io_uring/poll.h | 9 +------ io_uring/rsrc.c | 10 +++----- io_uring/rsrc.h | 7 +----- io_uring/rw.c | 14 +++++------ io_uring/rw.h | 7 ++---- io_uring/uring_cmd.c | 14 +++-------- io_uring/uring_cmd.h | 6 +---- 15 files changed, 93 insertions(+), 145 deletions(-) (limited to 'io_uring/rw.c') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 0f24fdad19c2..ef45b8bd1b35 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -220,7 +220,7 @@ struct io_ev_fd { }; struct io_alloc_cache { - struct io_wq_work_node list; + void **entries; unsigned int nr_cached; unsigned int max_cached; size_t elem_size; diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 138ad14b0b12..b7a38a2069cf 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -6,61 +6,56 @@ */ #define IO_ALLOC_CACHE_MAX 128 -struct io_cache_entry { - struct io_wq_work_node node; -}; - static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, - struct io_cache_entry *entry) + void *entry) { if (cache->nr_cached < cache->max_cached) { - cache->nr_cached++; - wq_stack_add_head(&entry->node, &cache->list); - kasan_mempool_poison_object(entry); + if (!kasan_mempool_poison_object(entry)) + return false; + cache->entries[cache->nr_cached++] = entry; return true; } return false; } -static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache) -{ - return !cache->list.next; -} - -static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) +static inline void *io_alloc_cache_get(struct io_alloc_cache *cache) { - if (cache->list.next) { - struct io_cache_entry *entry; + if (cache->nr_cached) { + void *entry = cache->entries[--cache->nr_cached]; - entry = container_of(cache->list.next, struct io_cache_entry, node); kasan_mempool_unpoison_object(entry, cache->elem_size); - cache->list.next = cache->list.next->next; - cache->nr_cached--; return entry; } return NULL; } -static inline void io_alloc_cache_init(struct io_alloc_cache *cache, +/* returns false if the cache was initialized properly */ +static inline bool io_alloc_cache_init(struct io_alloc_cache *cache, unsigned max_nr, size_t size) { - cache->list.next = NULL; - cache->nr_cached = 0; - cache->max_cached = max_nr; - cache->elem_size = size; + cache->entries = kvmalloc_array(max_nr, sizeof(void *), GFP_KERNEL); + if (cache->entries) { + cache->nr_cached = 0; +
cache->max_cached = max_nr; + cache->elem_size = size; + return false; + } + return true; } static inline void io_alloc_cache_free(struct io_alloc_cache *cache, - void (*free)(struct io_cache_entry *)) + void (*free)(const void *)) { - while (1) { - struct io_cache_entry *entry = io_alloc_cache_get(cache); + void *entry; + + if (!cache->entries) + return; - if (!entry) - break; + while ((entry = io_alloc_cache_get(cache)) != NULL) free(entry); - } - cache->nr_cached = 0; + + kvfree(cache->entries); + cache->entries = NULL; } #endif diff --git a/io_uring/futex.c b/io_uring/futex.c index 792a03df58de..914848f46beb 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -9,7 +9,7 @@ #include "../kernel/futex/futex.h" #include "io_uring.h" -#include "rsrc.h" +#include "alloc_cache.h" #include "futex.h" struct io_futex { @@ -27,27 +27,21 @@ struct io_futex { }; struct io_futex_data { - union { - struct futex_q q; - struct io_cache_entry cache; - }; + struct futex_q q; struct io_kiocb *req; }; -void io_futex_cache_init(struct io_ring_ctx *ctx) -{ - io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX, - sizeof(struct io_futex_data)); -} +#define IO_FUTEX_ALLOC_CACHE_MAX 32 -static void io_futex_cache_entry_free(struct io_cache_entry *entry) +bool io_futex_cache_init(struct io_ring_ctx *ctx) { - kfree(container_of(entry, struct io_futex_data, cache)); + return io_alloc_cache_init(&ctx->futex_cache, IO_FUTEX_ALLOC_CACHE_MAX, + sizeof(struct io_futex_data)); } void io_futex_cache_free(struct io_ring_ctx *ctx) { - io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free); + io_alloc_cache_free(&ctx->futex_cache, kfree); } static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) @@ -63,7 +57,7 @@ static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) struct io_ring_ctx *ctx = req->ctx; io_tw_lock(ctx, ts); - if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache)) + if (!io_alloc_cache_put(&ctx->futex_cache, ifd)) kfree(ifd); __io_futex_complete(req, ts); } @@ -259,11 +253,11 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q) static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx) { - struct io_cache_entry *entry; + struct io_futex_data *ifd; - entry = io_alloc_cache_get(&ctx->futex_cache); - if (entry) - return container_of(entry, struct io_futex_data, cache); + ifd = io_alloc_cache_get(&ctx->futex_cache); + if (ifd) + return ifd; return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT); } diff --git a/io_uring/futex.h b/io_uring/futex.h index 0847e9e8a127..b8bb09873d57 100644 --- a/io_uring/futex.h +++ b/io_uring/futex.h @@ -13,7 +13,7 @@ int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags); bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); -void io_futex_cache_init(struct io_ring_ctx *ctx); +bool io_futex_cache_init(struct io_ring_ctx *ctx); void io_futex_cache_free(struct io_ring_ctx *ctx); #else static inline int io_futex_cancel(struct io_ring_ctx *ctx, @@ -27,8 +27,9 @@ static inline bool io_futex_remove_all(struct io_ring_ctx *ctx, { return false; } -static inline void io_futex_cache_init(struct io_ring_ctx *ctx) +static inline bool io_futex_cache_init(struct io_ring_ctx *ctx) { + return false; } static inline void io_futex_cache_free(struct io_ring_ctx *ctx) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 579618fad833..1d453eb8e49f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ 
-276,6 +276,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; int hash_bits; + bool ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -305,17 +306,19 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); INIT_HLIST_HEAD(&ctx->io_buf_list); - io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, + ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, sizeof(struct io_rsrc_node)); - io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, + ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, sizeof(struct async_poll)); - io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, + ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_msghdr)); - io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, + ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_rw)); - io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, + ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, sizeof(struct uring_cache)); - io_futex_cache_init(ctx); + ret |= io_futex_cache_init(ctx); + if (ret) + goto err; init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); @@ -345,6 +348,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) return ctx; err: + io_alloc_cache_free(&ctx->rsrc_node_cache, kfree); + io_alloc_cache_free(&ctx->apoll_cache, kfree); + io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); + io_alloc_cache_free(&ctx->uring_cache, kfree); + io_futex_cache_free(ctx); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); xa_destroy(&ctx->io_bl_xa); @@ -1482,7 +1491,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, if (apoll->double_poll) kfree(apoll->double_poll); - if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache)) + if (!io_alloc_cache_put(&ctx->apoll_cache, apoll)) kfree(apoll); req->flags &= ~REQ_F_POLLED; } @@ -2778,11 +2787,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static void io_rsrc_node_cache_free(struct io_cache_entry *entry) -{ - kfree(container_of(entry, struct io_rsrc_node, cache)); -} - static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); @@ -2797,10 +2801,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) __io_sqe_files_unregister(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); - io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); + io_alloc_cache_free(&ctx->apoll_cache, kfree); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); - io_alloc_cache_free(&ctx->uring_cache, io_uring_cache_free); + io_alloc_cache_free(&ctx->uring_cache, kfree); io_futex_cache_free(ctx); io_destroy_buffers(ctx); mutex_unlock(&ctx->uring_lock); @@ -2816,7 +2820,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); - io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); + io_alloc_cache_free(&ctx->rsrc_node_cache, kfree); if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; diff --git 
a/io_uring/net.c b/io_uring/net.c index 3bef562b67de..d0abc5689066 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -137,7 +137,7 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) /* Let normal cleanup path reap it if we fail adding to the cache */ iov = hdr->free_iov; - if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) { + if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { if (iov) kasan_mempool_poison_object(iov); req->async_data = NULL; @@ -148,12 +148,10 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_cache_entry *entry; struct io_async_msghdr *hdr; - entry = io_alloc_cache_get(&ctx->netmsg_cache); - if (entry) { - hdr = container_of(entry, struct io_async_msghdr, cache); + hdr = io_alloc_cache_get(&ctx->netmsg_cache); + if (hdr) { if (hdr->free_iov) { kasan_mempool_unpoison_object(hdr->free_iov, hdr->free_iov_nr * sizeof(struct iovec)); @@ -1490,11 +1488,10 @@ out: return IOU_OK; } -void io_netmsg_cache_free(struct io_cache_entry *entry) +void io_netmsg_cache_free(const void *entry) { - struct io_async_msghdr *kmsg; + struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; - kmsg = container_of(entry, struct io_async_msghdr, cache); if (kmsg->free_iov) { kasan_mempool_unpoison_object(kmsg->free_iov, kmsg->free_iov_nr * sizeof(struct iovec)); diff --git a/io_uring/net.h b/io_uring/net.h index b47b43ec6459..0eb1c1920fc9 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -3,23 +3,15 @@ #include #include -#include "alloc_cache.h" - struct io_async_msghdr { #if defined(CONFIG_NET) - union { - struct iovec fast_iov; - struct { - struct io_cache_entry cache; - /* entry size of ->free_iov, if valid */ - int free_iov_nr; - }; - }; + struct iovec fast_iov; /* points to an allocated iov, if NULL we use fast_iov instead */ struct iovec *free_iov; + int free_iov_nr; + int namelen; __kernel_size_t controllen; __kernel_size_t payloadlen; - int namelen; struct sockaddr __user *uaddr; struct msghdr msg; struct sockaddr_storage addr; @@ -57,9 +49,9 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags); int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); void io_send_zc_cleanup(struct io_kiocb *req); -void io_netmsg_cache_free(struct io_cache_entry *entry); +void io_netmsg_cache_free(const void *entry); #else -static inline void io_netmsg_cache_free(struct io_cache_entry *entry) +static inline void io_netmsg_cache_free(const void *entry) { } #endif diff --git a/io_uring/poll.c b/io_uring/poll.c index 5d55bbf1de15..0a8e02944689 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -14,6 +14,7 @@ #include #include "io_uring.h" +#include "alloc_cache.h" #include "refs.h" #include "napi.h" #include "opdef.h" @@ -686,17 +687,15 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_cache_entry *entry; struct async_poll *apoll; if (req->flags & REQ_F_POLLED) { apoll = req->apoll; kfree(apoll->double_poll); } else if (!(issue_flags & IO_URING_F_UNLOCKED)) { - entry = io_alloc_cache_get(&ctx->apoll_cache); - if (entry == NULL) + apoll = io_alloc_cache_get(&ctx->apoll_cache); + if (!apoll) goto alloc_apoll; - apoll = container_of(entry, struct async_poll, cache); apoll->poll.retries = APOLL_MAX_RETRY; } else { alloc_apoll: @@ -1055,8 +1054,3 @@ out: io_req_set_res(req, ret, 0); 
return IOU_OK; } - -void io_apoll_cache_free(struct io_cache_entry *entry) -{ - kfree(container_of(entry, struct async_poll, cache)); -} diff --git a/io_uring/poll.h b/io_uring/poll.h index 1dacae9e816c..5c240f11069a 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include "alloc_cache.h" - enum { IO_APOLL_OK, IO_APOLL_ABORTED, @@ -17,10 +15,7 @@ struct io_poll { }; struct async_poll { - union { - struct io_poll poll; - struct io_cache_entry cache; - }; + struct io_poll poll; struct io_poll *double_poll; }; @@ -46,6 +41,4 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); -void io_apoll_cache_free(struct io_cache_entry *entry); - void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 4818b79231dd..7b8a056f98ed 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -13,6 +13,7 @@ #include #include "io_uring.h" +#include "alloc_cache.h" #include "openclose.h" #include "rsrc.h" @@ -169,7 +170,7 @@ static void io_rsrc_put_work(struct io_rsrc_node *node) void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache)) + if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node)) kfree(node); } @@ -197,12 +198,9 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { struct io_rsrc_node *ref_node; - struct io_cache_entry *entry; - entry = io_alloc_cache_get(&ctx->rsrc_node_cache); - if (entry) { - ref_node = container_of(entry, struct io_rsrc_node, cache); - } else { + ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache); + if (!ref_node) { ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); if (!ref_node) return NULL; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index e21000238954..83c079a707f8 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -2,8 +2,6 @@ #ifndef IOU_RSRC_H #define IOU_RSRC_H -#include "alloc_cache.h" - #define IO_NODE_ALLOC_CACHE_MAX 32 #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) @@ -36,10 +34,7 @@ struct io_rsrc_data { }; struct io_rsrc_node { - union { - struct io_cache_entry cache; - struct io_ring_ctx *ctx; - }; + struct io_ring_ctx *ctx; int refs; bool empty; u16 type; diff --git a/io_uring/rw.c b/io_uring/rw.c index e84d322a6150..3134a6ece1be 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -18,6 +18,7 @@ #include "io_uring.h" #include "opdef.h" #include "kbuf.h" +#include "alloc_cache.h" #include "rsrc.h" #include "poll.h" #include "rw.h" @@ -154,7 +155,7 @@ static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) return; } iov = rw->free_iovec; - if (io_alloc_cache_put(&req->ctx->rw_cache, &rw->cache)) { + if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { if (iov) kasan_mempool_poison_object(iov); req->async_data = NULL; @@ -200,12 +201,10 @@ static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) static int io_rw_alloc_async(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_cache_entry *entry; struct io_async_rw *rw; - entry = io_alloc_cache_get(&ctx->rw_cache); - if (entry) { - rw = container_of(entry, struct io_async_rw, cache); + rw = io_alloc_cache_get(&ctx->rw_cache); + if (rw) { if (rw->free_iovec) { kasan_mempool_unpoison_object(rw->free_iovec, rw->free_iov_nr * sizeof(struct iovec)); @@ -1168,11 +1167,10 @@ int 
io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) return nr_events; } -void io_rw_cache_free(struct io_cache_entry *entry) +void io_rw_cache_free(const void *entry) { - struct io_async_rw *rw; + struct io_async_rw *rw = (struct io_async_rw *) entry; - rw = container_of(entry, struct io_async_rw, cache); if (rw->free_iovec) { kasan_mempool_unpoison_object(rw->free_iovec, rw->free_iov_nr * sizeof(struct iovec)); diff --git a/io_uring/rw.h b/io_uring/rw.h index cf51d0eb407a..3f432dc75441 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -3,10 +3,7 @@ #include struct io_async_rw { - union { - size_t bytes_done; - struct io_cache_entry cache; - }; + size_t bytes_done; struct iov_iter iter; struct iov_iter_state iter_state; struct iovec fast_iov; @@ -28,4 +25,4 @@ void io_rw_fail(struct io_kiocb *req); void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); -void io_rw_cache_free(struct io_cache_entry *entry); +void io_rw_cache_free(const void *entry); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 92346b5d9f5b..334d31dd6628 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -11,18 +11,17 @@ #include #include "io_uring.h" +#include "alloc_cache.h" #include "rsrc.h" #include "uring_cmd.h" static struct uring_cache *io_uring_async_get(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_cache_entry *entry; struct uring_cache *cache; - entry = io_alloc_cache_get(&ctx->uring_cache); - if (entry) { - cache = container_of(entry, struct uring_cache, cache); + cache = io_alloc_cache_get(&ctx->uring_cache); + if (cache) { req->flags |= REQ_F_ASYNC_DATA; req->async_data = cache; return cache; @@ -39,7 +38,7 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) if (issue_flags & IO_URING_F_UNLOCKED) return; - if (io_alloc_cache_put(&req->ctx->uring_cache, &cache->cache)) { + if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) { ioucmd->sqe = NULL; req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; @@ -354,8 +353,3 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) } EXPORT_SYMBOL_GPL(io_uring_cmd_sock); #endif - -void io_uring_cache_free(struct io_cache_entry *entry) -{ - kfree(container_of(entry, struct uring_cache, cache)); -} diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index 477ea8865639..a361f98664d2 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -1,15 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 struct uring_cache { - union { - struct io_cache_entry cache; - struct io_uring_sqe sqes[2]; - }; + struct io_uring_sqe sqes[2]; }; int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -void io_uring_cache_free(struct io_cache_entry *entry); bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); -- cgit v1.2.3-70-g09d2 From df604d2ad480fcf7b39767280c9093e13b1de952 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 17 Apr 2024 09:23:55 -0600 Subject: io_uring/rw: ensure retry condition isn't lost A previous commit removed the checking on whether or not it was possible to retry a request, since it's now possible to retry any of them. This would previously have caused the request to have been ended with an error, but now the retry condition can simply get lost instead. 
Cleanup the retry handling and always just punt it to task_work, which will queue it with io-wq appropriately. Reported-by: Changhui Zhong Tested-by: Ming Lei Fixes: cca6571381a0 ("io_uring/rw: cleanup retry path") Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 +++++++++++++ io_uring/io_uring.h | 1 + io_uring/rw.c | 13 ++++++------- 3 files changed, 20 insertions(+), 7 deletions(-) (limited to 'io_uring/rw.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3c9087f37c43..c67ae6e36c4f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -527,6 +527,19 @@ static void io_queue_iowq(struct io_kiocb *req) io_queue_linked_timeout(link); } +static void io_tw_requeue_iowq(struct io_kiocb *req, struct io_tw_state *ts) +{ + req->flags &= ~REQ_F_REISSUE; + io_queue_iowq(req); +} + +void io_tw_queue_iowq(struct io_kiocb *req) +{ + req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; + req->io_task_work.func = io_tw_requeue_iowq; + io_req_task_work_add(req); +} + static __cold void io_queue_deferred(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->defer_list)) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 624ca9076a50..b83a719c5443 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -75,6 +75,7 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); +void io_tw_queue_iowq(struct io_kiocb *req); void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/rw.c b/io_uring/rw.c index 3134a6ece1be..4fed829fe97c 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -455,7 +455,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) * current cycle. */ io_req_io_end(req); - req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; + io_tw_queue_iowq(req); return true; } req_set_fail(req); @@ -521,7 +521,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) io_req_end_write(req); if (unlikely(res != req->cqe.res)) { if (res == -EAGAIN && io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; + io_tw_queue_iowq(req); return; } req->cqe.res = res; @@ -839,7 +839,8 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_iter_do_read(rw, &io->iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { - req->flags &= ~REQ_F_REISSUE; + if (req->flags & REQ_F_REISSUE) + return IOU_ISSUE_SKIP_COMPLETE; /* If we can poll, just do that. */ if (io_file_can_poll(req)) return -EAGAIN; @@ -1034,10 +1035,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) else ret2 = -EINVAL; - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - ret2 = -EAGAIN; - } + if (req->flags & REQ_F_REISSUE) + return IOU_ISSUE_SKIP_COMPLETE; /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just -- cgit v1.2.3-70-g09d2 From 039a2e800bcd5beb89909d1a488abf3d647642cf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Apr 2024 09:04:32 -0600 Subject: io_uring/rw: reinstate thread check for retries Allowing retries for everything is arguably the right thing to do, now that every command type is async read from the start. 
But it's exposed a few issues around missing check for a retry (which cca6571381a0 exposed), and the fixup commit for that isn't necessarily 100% sound in terms of iov_iter state. For now, just revert these two commits. This unfortunately then re-opens the fact that -EAGAIN can get bubbled to userspace for some cases where the kernel very well could just sanely retry them. But until we have all the conditions covered around that, we cannot safely enable that. This reverts commit df604d2ad480fcf7b39767280c9093e13b1de952. This reverts commit cca6571381a0bdc88021a1f7a4c2349df21279f7. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 ------------- io_uring/io_uring.h | 1 - io_uring/rw.c | 40 +++++++++++++++++++++++++++++----------- 3 files changed, 29 insertions(+), 25 deletions(-) (limited to 'io_uring/rw.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 64845634d89f..2675cffbd9a4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -527,19 +527,6 @@ static void io_queue_iowq(struct io_kiocb *req) io_queue_linked_timeout(link); } -static void io_tw_requeue_iowq(struct io_kiocb *req, struct io_tw_state *ts) -{ - req->flags &= ~REQ_F_REISSUE; - io_queue_iowq(req); -} - -void io_tw_queue_iowq(struct io_kiocb *req) -{ - req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; - req->io_task_work.func = io_tw_requeue_iowq; - io_req_task_work_add(req); -} - static __cold void io_queue_deferred(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->defer_list)) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index b83a719c5443..624ca9076a50 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -75,7 +75,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); -void io_tw_queue_iowq(struct io_kiocb *req); void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/rw.c b/io_uring/rw.c index 4fed829fe97c..a6bf2ea8db91 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -396,9 +396,16 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) return NULL; } +#ifdef CONFIG_BLOCK +static void io_resubmit_prep(struct io_kiocb *req) +{ + struct io_async_rw *io = req->async_data; + + iov_iter_restore(&io->iter, &io->iter_state); +} + static bool io_rw_should_reissue(struct io_kiocb *req) { -#ifdef CONFIG_BLOCK umode_t mode = file_inode(req->file)->i_mode; struct io_ring_ctx *ctx = req->ctx; @@ -414,11 +421,23 @@ static bool io_rw_should_reissue(struct io_kiocb *req) */ if (percpu_ref_is_dying(&ctx->refs)) return false; + /* + * Play it safe and assume not safe to re-import and reissue if we're + * not in the original thread group (or in task context). + */ + if (!same_thread_group(req->task, current) || !in_task()) + return false; return true; +} #else +static void io_resubmit_prep(struct io_kiocb *req) +{ +} +static bool io_rw_should_reissue(struct io_kiocb *req) +{ return false; -#endif } +#endif static void io_req_end_write(struct io_kiocb *req) { @@ -455,7 +474,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) * current cycle. 
*/ io_req_io_end(req); - io_tw_queue_iowq(req); + req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; return true; } req_set_fail(req); @@ -521,7 +540,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) io_req_end_write(req); if (unlikely(res != req->cqe.res)) { if (res == -EAGAIN && io_rw_should_reissue(req)) { - io_tw_queue_iowq(req); + req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; return; } req->cqe.res = res; @@ -583,10 +602,8 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, } if (req->flags & REQ_F_REISSUE) { - struct io_async_rw *io = req->async_data; - req->flags &= ~REQ_F_REISSUE; - iov_iter_restore(&io->iter, &io->iter_state); + io_resubmit_prep(req); return -EAGAIN; } return IOU_ISSUE_SKIP_COMPLETE; @@ -839,8 +856,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_iter_do_read(rw, &io->iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { - if (req->flags & REQ_F_REISSUE) - return IOU_ISSUE_SKIP_COMPLETE; + req->flags &= ~REQ_F_REISSUE; /* If we can poll, just do that. */ if (io_file_can_poll(req)) return -EAGAIN; @@ -1035,8 +1051,10 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) else ret2 = -EINVAL; - if (req->flags & REQ_F_REISSUE) - return IOU_ISSUE_SKIP_COMPLETE; + if (req->flags & REQ_F_REISSUE) { + req->flags &= ~REQ_F_REISSUE; + ret2 = -EAGAIN; + } /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just -- cgit v1.2.3-70-g09d2
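Closing note: the reissue path above hinges on rolling the iterator back before handing -EAGAIN to the core code; io_resubmit_prep() simply calls iov_iter_restore() on the saved ->iter_state, so a retry consumes the same byte range as the original attempt. Below is a minimal userspace sketch of that save/restore idea; the cursor/cursor_state types and do_io() are invented stand-ins, not kernel API:

/*
 * Userspace sketch only: snapshot the I/O cursor before issuing, restore it
 * before retrying on EAGAIN so the retry sees the original range.
 */
#include <errno.h>
#include <stddef.h>
#include <stdio.h>

struct cursor {
	size_t off;	/* bytes already consumed */
	size_t len;	/* total bytes in the buffer */
};

struct cursor_state {
	size_t off;
};

static void cursor_save(const struct cursor *c, struct cursor_state *st)
{
	st->off = c->off;
}

static void cursor_restore(struct cursor *c, const struct cursor_state *st)
{
	c->off = st->off;
}

/* Fake backend: consumes some data, then reports EAGAIN on the first call. */
static int do_io(struct cursor *c, int *fail_once)
{
	c->off += (c->len - c->off) / 2;
	if (*fail_once) {
		*fail_once = 0;
		return -EAGAIN;
	}
	c->off = c->len;
	return 0;
}

int main(void)
{
	struct cursor c = { .off = 0, .len = 64 };
	struct cursor_state st;
	int fail_once = 1;
	int ret;

	cursor_save(&c, &st);			/* like saving ->iter_state */
	ret = do_io(&c, &fail_once);
	if (ret == -EAGAIN) {
		cursor_restore(&c, &st);	/* like iov_iter_restore() on reissue */
		ret = do_io(&c, &fail_once);
	}
	printf("ret=%d consumed=%zu/%zu\n", ret, c.off, c.len);
	return 0;
}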