author    | Linus Torvalds <torvalds@linux-foundation.org> | 2023-11-01 11:09:19 -1000
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-11-01 11:09:19 -1000
commit    | ffa059b262ba72571e7fefe7fa2b4ebb6776b277 (patch)
tree      | d837908365664bc1a7192f89609546a60cff8e37 /io_uring/kbuf.c
parent    | ca995ce438cc641c47d4b8e4abeb1878a3d07c5f (diff)
parent    | 6ce4a93dbb5bd93bc2bdf14da63f9360a4dcd6a1 (diff)
Merge tag 'for-6.7/io_uring-2023-10-30' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:
"This contains the core io_uring updates, of which there are not many,
and adds support for using WAITID through io_uring and hence not
needing to block on these kinds of events.
Outside of that, tweaks to the legacy provided buffer handling and
some cleanups related to cancelations for uring_cmd support"
* tag 'for-6.7/io_uring-2023-10-30' of git://git.kernel.dk/linux:
io_uring/poll: use IOU_F_TWQ_LAZY_WAKE for wakeups
io_uring/kbuf: Use slab for struct io_buffer objects
io_uring/kbuf: Allow the full buffer id space for provided buffers
io_uring/kbuf: Fix check of BID wrapping in provided buffers
io_uring/rsrc: cleanup io_pin_pages()
io_uring: cancelable uring_cmd
io_uring: retain top 8bits of uring_cmd flags for kernel internal use
io_uring: add IORING_OP_WAITID support
exit: add internal include file with helpers
exit: add kernel_waitid_prepare() helper
exit: move core of do_wait() into helper
exit: abstract out should_wake helper for child_wait_callback()
io_uring/rw: add support for IORING_OP_READ_MULTISHOT
io_uring/rw: mark readv/writev as vectored in the opcode definition
io_uring/rw: split io_read() into a helper
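
As a point of reference for the IORING_OP_WAITID addition listed above, the sketch below shows how userspace might reap a child through the ring instead of blocking in waitid(2). It assumes a liburing release that ships the io_uring_prep_waitid() helper (paired with this kernel series); it is illustrative only and not part of this pull.

/*
 * Hedged sketch: wait for a child via IORING_OP_WAITID instead of a
 * blocking waitid(2) call. Assumes liburing provides io_uring_prep_waitid().
 * Build with: cc waitid_demo.c -luring
 */
#include <liburing.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	siginfo_t info = { 0 };
	pid_t pid;

	if (io_uring_queue_init(4, &ring, 0))
		return 1;

	pid = fork();
	if (pid == 0)
		_exit(42);		/* child exits immediately */

	sqe = io_uring_get_sqe(&ring);
	/* queue the wait; the result arrives as a CQE, no blocking syscall */
	io_uring_prep_waitid(sqe, P_PID, pid, &info, WEXITED, 0);
	io_uring_submit(&ring);

	if (io_uring_wait_cqe(&ring, &cqe))
		return 1;
	printf("waitid res=%d, child exit status=%d\n", cqe->res, info.si_status);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}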
Diffstat (limited to 'io_uring/kbuf.c')
-rw-r--r-- | io_uring/kbuf.c | 58
1 file changed, 33 insertions(+), 25 deletions(-)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 9123138aa9f4..fea06810b43d 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -19,12 +19,17 @@
 #define BGID_ARRAY 64
 
+/* BIDs are addressed by a 16-bit field in a CQE */
+#define MAX_BIDS_PER_BGID (1 << 16)
+
+struct kmem_cache *io_buf_cachep;
+
 struct io_provide_buf {
 	struct file *file;
 	__u64 addr;
 	__u32 len;
 	__u32 bgid;
-	__u16 nbufs;
+	__u32 nbufs;
 	__u16 bid;
 };
@@ -255,6 +260,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 void io_destroy_buffers(struct io_ring_ctx *ctx)
 {
 	struct io_buffer_list *bl;
+	struct list_head *item, *tmp;
+	struct io_buffer *buf;
 	unsigned long index;
 	int i;
@@ -270,12 +277,9 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
 		kfree(bl);
 	}
 
-	while (!list_empty(&ctx->io_buffers_pages)) {
-		struct page *page;
-
-		page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
-		list_del_init(&page->lru);
-		__free_page(page);
+	list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
+		buf = list_entry(item, struct io_buffer, list);
+		kmem_cache_free(io_buf_cachep, buf);
 	}
 }
@@ -289,7 +293,7 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -EINVAL;
 
 	tmp = READ_ONCE(sqe->fd);
-	if (!tmp || tmp > USHRT_MAX)
+	if (!tmp || tmp > MAX_BIDS_PER_BGID)
 		return -EINVAL;
 
 	memset(p, 0, sizeof(*p));
@@ -332,7 +336,7 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -EINVAL;
 
 	tmp = READ_ONCE(sqe->fd);
-	if (!tmp || tmp > USHRT_MAX)
+	if (!tmp || tmp > MAX_BIDS_PER_BGID)
 		return -E2BIG;
 	p->nbufs = tmp;
 	p->addr = READ_ONCE(sqe->addr);
@@ -352,17 +356,18 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	tmp = READ_ONCE(sqe->off);
 	if (tmp > USHRT_MAX)
 		return -E2BIG;
-	if (tmp + p->nbufs >= USHRT_MAX)
+	if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
 		return -EINVAL;
 	p->bid = tmp;
 	return 0;
 }
 
+#define IO_BUFFER_ALLOC_BATCH 64
+
 static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
 {
-	struct io_buffer *buf;
-	struct page *page;
-	int bufs_in_page;
+	struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
+	int allocated;
 
 	/*
 	 * Completions that don't happen inline (eg not under uring_lock) will
@@ -382,22 +387,25 @@ static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
 	/*
 	 * No free buffers and no completion entries either. Allocate a new
-	 * page worth of buffer entries and add those to our freelist.
+	 * batch of buffer entries and add those to our freelist.
 	 */
-	page = alloc_page(GFP_KERNEL_ACCOUNT);
-	if (!page)
-		return -ENOMEM;
-
-	list_add(&page->lru, &ctx->io_buffers_pages);
-	buf = page_address(page);
-	bufs_in_page = PAGE_SIZE / sizeof(*buf);
-	while (bufs_in_page) {
-		list_add_tail(&buf->list, &ctx->io_buffers_cache);
-		buf++;
-		bufs_in_page--;
+	allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
+					  ARRAY_SIZE(bufs), (void **) bufs);
+	if (unlikely(!allocated)) {
+		/*
+		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
+		 * retry single alloc to be on the safe side.
+		 */
+		bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
+		if (!bufs[0])
+			return -ENOMEM;
+		allocated = 1;
 	}
+
+	while (allocated)
+		list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);
+
 	return 0;
 }
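
For context on the provided-buffer limits relaxed above: with nbufs widened to __u32 and the USHRT_MAX checks replaced by MAX_BIDS_PER_BGID, one legacy buffer group can now cover the full 16-bit BID space (65536 buffers starting at bid 0). The userspace sketch below exercises that limit through liburing's io_uring_prep_provide_buffers() helper; the group id, buffer size, and error handling are illustrative assumptions, not part of this diff.

/*
 * Hedged sketch: hand a full 16-bit BID space of legacy provided buffers
 * to the kernel in one SQE. Assumes liburing is installed.
 * Build with: cc provide_bufs_demo.c -luring
 */
#include <liburing.h>
#include <stdlib.h>
#include <stdio.h>

#define NR_BUFS  65536		/* full BID space: bid 0..65535 */
#define BUF_SIZE 1024
#define BGID     7		/* arbitrary buffer group id */

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	void *base;

	if (io_uring_queue_init(8, &ring, 0))
		return 1;
	base = malloc((size_t)NR_BUFS * BUF_SIZE);
	if (!base)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	/* one SQE provides all 65536 buffers to group BGID, bids starting at 0 */
	io_uring_prep_provide_buffers(sqe, base, BUF_SIZE, NR_BUFS, BGID, 0);

	if (io_uring_submit_and_wait(&ring, 1) < 0)
		return 1;
	if (io_uring_wait_cqe(&ring, &cqe))
		return 1;
	printf("provide_buffers result: %d\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}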