author     Linus Torvalds <torvalds@linux-foundation.org>  2024-11-30 15:43:02 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2024-11-30 15:43:02 -0800
commit     dd54fcced81d479d77acbeb4eea74b9ab9276bff
tree       848633b6e3c56138cf8ab68a74f15f4f7878f51a /io_uring
parent     133577cad6bf48e5a7848c4338124081393bfe8a
parent     7eb75ce7527129d7f1fee6951566af409a37a1c4
Merge tag 'io_uring-6.13-20241129' of git://git.kernel.dk/linux
Pull more io_uring updates from Jens Axboe:
- Remove a leftover struct from when the cqwait registered waiting was
transitioned to regions.
- Fix for an issue introduced in this merge window, where nop->fd might
be used uninitialized. Ensure it's always set.
- Add capping of the task_work run in local task_work mode, to prevent
bursty and long chains from adding too much latency (a simplified
userspace sketch of the scheme follows this list).
- Work around xa_store() leaving ->head non-NULL if it encounters an
allocation error during storing. Just a debug trigger, and can go
away once xa_store() behaves in a more expected way for this
condition. Not a major thing as it basically requires fault injection
to trigger it.
- Fix a few mapping corner cases, including overflow checking when
computing pinned page ranges (a sketch of that range math also follows
this list).
- Fix KCSAN complaint on reading the table size post unlock. Again not
a "real" issue, but it's easy to silence by just keeping the reading
inside the lock that protects it.
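
The capping change bounds how many locally queued task_work items are run per
invocation; anything beyond the cap is parked on a retry list that is drained
first on the next run, while min_events can still force additional loops. The
userspace C sketch below models only that parking/draining idea under
simplified assumptions: the work_item/run_list/run_local_work names, the plain
singly linked lists, and the lack of locking and of min_events handling are
illustrative, not the kernel implementation.

/*
 * Simplified userspace model of capped local task_work processing.
 * All names here (work_item, run_list, run_local_work) are illustrative.
 */
#include <stdio.h>
#include <stdlib.h>

#define LOCAL_TW_DEFAULT_MAX	20	/* per-run cap, mirroring the new default of 20 */

struct work_item {
	int id;
	struct work_item *next;
};

static struct work_item *pending;	/* newly queued work */
static struct work_item *retry;		/* work deferred by an earlier capped run */

/* Complete at most budget items from *list, leaving the remainder in place. */
static int run_list(struct work_item **list, int budget)
{
	int done = 0;

	while (*list && done < budget) {
		struct work_item *req = *list;

		*list = req->next;
		free(req);		/* "complete" the request */
		done++;
	}
	return done;
}

static int run_local_work(int max_events)
{
	/* Items deferred by a previous capped run go first. */
	int done = run_list(&retry, max_events);

	if (retry)
		return done;	/* cap hit before the retry list emptied */

	/* Grab the pending list, spend what is left of the budget ... */
	struct work_item *node = pending;

	pending = NULL;
	done += run_list(&node, max_events - done);
	/* ... and park any overflow for the next run. */
	retry = node;
	return done;
}

int main(void)
{
	for (int i = 0; i < 50; i++) {
		struct work_item *req = malloc(sizeof(*req));

		req->id = i;
		req->next = pending;
		pending = req;
	}
	/* 50 queued items drain as 20 + 20 + 10 across three capped runs. */
	printf("run 1: %d\n", run_local_work(LOCAL_TW_DEFAULT_MAX));	/* 20 */
	printf("run 2: %d\n", run_local_work(LOCAL_TW_DEFAULT_MAX));	/* 20 */
	printf("run 3: %d\n", run_local_work(LOCAL_TW_DEFAULT_MAX));	/* 10 */
	return 0;
}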
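One of the mapping fixes ("io_uring: check for overflows in io_pin_pages", see
the diff below) replaces open-coded page-range math with checked additions.
Here is a minimal userspace sketch of the same arithmetic; it uses the
__builtin_add_overflow() compiler builtin that the kernel's
check_add_overflow() helper is essentially a wrapper around. The pin_range()
name, the hard-coded 4K page size, and the zero-length handling are
assumptions of this sketch, not kernel code.

/*
 * Userspace sketch of the overflow-checked range math; pin_range(), the
 * fixed 4K page size and the zero-length handling are assumptions here.
 */
#include <errno.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/* Number of pages covering [uaddr, uaddr + len), or -EOVERFLOW on wraparound. */
static long pin_range(unsigned long uaddr, unsigned long len)
{
	unsigned long start, end, nr_pages;

	/* Neither the end address nor its round-up to a page may wrap. */
	if (__builtin_add_overflow(uaddr, len, &end))
		return -EOVERFLOW;
	if (__builtin_add_overflow(end, PAGE_SIZE - 1, &end))
		return -EOVERFLOW;

	end >>= PAGE_SHIFT;
	start = uaddr >> PAGE_SHIFT;
	nr_pages = end - start;
	if (!nr_pages)
		return -EOVERFLOW;	/* zero-length range */
	return nr_pages;
}

int main(void)
{
	/* A well-formed three-page range. */
	printf("%ld\n", pin_range(0x10000, 3 * PAGE_SIZE));		/* 3 */
	/* The old open-coded sum would silently wrap here; now it is rejected. */
	printf("%ld\n", pin_range(-1UL - PAGE_SIZE, 4 * PAGE_SIZE));	/* -EOVERFLOW */
	return 0;
}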
* tag 'io_uring-6.13-20241129' of git://git.kernel.dk/linux:
io_uring/tctx: work around xa_store() allocation error issue
io_uring: fix corner case forgetting to vunmap
io_uring: fix task_work cap overshooting
io_uring: check for overflows in io_pin_pages
io_uring/nop: ensure nop->fd is always initialized
io_uring: limit local tw done
io_uring: add io_local_work_pending()
io_uring/region: return negative -E2BIG in io_create_region()
io_uring: protect register tracing
io_uring: remove io_uring_cqwait_reg_arg
Diffstat (limited to 'io_uring')
-rw-r--r-- | io_uring/io_uring.c | 75
-rw-r--r-- | io_uring/io_uring.h |  9
-rw-r--r-- | io_uring/memmap.c   | 13
-rw-r--r-- | io_uring/nop.c      |  6
-rw-r--r-- | io_uring/register.c |  3
-rw-r--r-- | io_uring/tctx.c     | 13
6 files changed, 86 insertions, 33 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 801293399883..06ff41484e29 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -121,6 +121,7 @@
 
 #define IO_COMPL_BATCH			32
 #define IO_REQ_ALLOC_BATCH		8
+#define IO_LOCAL_TW_DEFAULT_MAX	20
 
 struct io_defer_entry {
 	struct list_head	list;
@@ -1255,12 +1256,14 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
 	struct llist_node *node = llist_del_all(&ctx->work_llist);
 
 	__io_fallback_tw(node, false);
+	node = llist_del_all(&ctx->retry_llist);
+	__io_fallback_tw(node, false);
 }
 
 static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
 				       int min_events)
 {
-	if (llist_empty(&ctx->work_llist))
+	if (!io_local_work_pending(ctx))
 		return false;
 	if (events < min_events)
 		return true;
@@ -1269,8 +1272,29 @@ static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
 	return false;
 }
 
+static int __io_run_local_work_loop(struct llist_node **node,
+				    struct io_tw_state *ts,
+				    int events)
+{
+	int ret = 0;
+
+	while (*node) {
+		struct llist_node *next = (*node)->next;
+		struct io_kiocb *req = container_of(*node, struct io_kiocb,
+						    io_task_work.node);
+		INDIRECT_CALL_2(req->io_task_work.func,
+				io_poll_task_func, io_req_rw_complete,
+				req, ts);
+		*node = next;
+		if (++ret >= events)
+			break;
+	}
+
+	return ret;
+}
+
 static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
-			       int min_events)
+			       int min_events, int max_events)
 {
 	struct llist_node *node;
 	unsigned int loops = 0;
@@ -1281,25 +1305,23 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
 		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
 again:
+	min_events -= ret;
+	ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, max_events);
+	if (ctx->retry_llist.first)
+		goto retry_done;
+
 	/*
 	 * llists are in reverse order, flip it back the right way before
 	 * running the pending items.
 	 */
 	node = llist_reverse_order(llist_del_all(&ctx->work_llist));
-	while (node) {
-		struct llist_node *next = node->next;
-		struct io_kiocb *req = container_of(node, struct io_kiocb,
-						    io_task_work.node);
-		INDIRECT_CALL_2(req->io_task_work.func,
-				io_poll_task_func, io_req_rw_complete,
-				req, ts);
-		ret++;
-		node = next;
-	}
+	ret += __io_run_local_work_loop(&node, ts, max_events - ret);
+	ctx->retry_llist.first = node;
 	loops++;
 
 	if (io_run_local_work_continue(ctx, ret, min_events))
 		goto again;
+retry_done:
 	io_submit_flush_completions(ctx);
 	if (io_run_local_work_continue(ctx, ret, min_events))
 		goto again;
@@ -1313,18 +1335,20 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx,
 {
 	struct io_tw_state ts = {};
 
-	if (llist_empty(&ctx->work_llist))
+	if (!io_local_work_pending(ctx))
 		return 0;
-	return __io_run_local_work(ctx, &ts, min_events);
+	return __io_run_local_work(ctx, &ts, min_events,
+				   max(IO_LOCAL_TW_DEFAULT_MAX, min_events));
 }
 
-static int io_run_local_work(struct io_ring_ctx *ctx, int min_events)
+static int io_run_local_work(struct io_ring_ctx *ctx, int min_events,
+			     int max_events)
 {
 	struct io_tw_state ts = {};
 	int ret;
 
 	mutex_lock(&ctx->uring_lock);
-	ret = __io_run_local_work(ctx, &ts, min_events);
+	ret = __io_run_local_work(ctx, &ts, min_events, max_events);
 	mutex_unlock(&ctx->uring_lock);
 	return ret;
 }
@@ -2328,9 +2352,9 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
 
 int io_run_task_work_sig(struct io_ring_ctx *ctx)
 {
-	if (!llist_empty(&ctx->work_llist)) {
+	if (io_local_work_pending(ctx)) {
 		__set_current_state(TASK_RUNNING);
-		if (io_run_local_work(ctx, INT_MAX) > 0)
+		if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0)
 			return 0;
 	}
 	if (io_run_task_work() > 0)
@@ -2459,7 +2483,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 {
 	if (unlikely(READ_ONCE(ctx->check_cq)))
 		return 1;
-	if (unlikely(!llist_empty(&ctx->work_llist)))
+	if (unlikely(io_local_work_pending(ctx)))
 		return 1;
 	if (unlikely(task_work_pending(current)))
 		return 1;
@@ -2493,8 +2517,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 
 	if (!io_allowed_run_tw(ctx))
 		return -EEXIST;
-	if (!llist_empty(&ctx->work_llist))
-		io_run_local_work(ctx, min_events);
+	if (io_local_work_pending(ctx))
+		io_run_local_work(ctx, min_events,
+				  max(IO_LOCAL_TW_DEFAULT_MAX, min_events));
 	io_run_task_work();
 
 	if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
@@ -2564,8 +2589,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 		 * If we got woken because of task_work being processed, run it
 		 * now rather than let the caller do another wait loop.
 		 */
-		if (!llist_empty(&ctx->work_llist))
-			io_run_local_work(ctx, nr_wait);
+		if (io_local_work_pending(ctx))
+			io_run_local_work(ctx, nr_wait, nr_wait);
 		io_run_task_work();
 
 		/*
@@ -3077,7 +3102,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 
 	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
 	    io_allowed_defer_tw_run(ctx))
-		ret |= io_run_local_work(ctx, INT_MAX) > 0;
+		ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0;
 	ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
 	mutex_lock(&ctx->uring_lock);
 	ret |= io_poll_remove_all(ctx, tctx, cancel_all);
@@ -3158,7 +3183,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 		io_run_task_work();
 		io_uring_drop_tctx_refs(current);
 		xa_for_each(&tctx->xa, index, node) {
-			if (!llist_empty(&node->ctx->work_llist)) {
+			if (io_local_work_pending(node->ctx)) {
 				WARN_ON_ONCE(node->ctx->submitter_task &&
 					     node->ctx->submitter_task != current);
 				goto end_wait;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 4070d4c8ef97..12abee607e4a 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -347,9 +347,14 @@ static inline int io_run_task_work(void)
 	return ret;
 }
 
+static inline bool io_local_work_pending(struct io_ring_ctx *ctx)
+{
+	return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist);
+}
+
 static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
 {
-	return task_work_pending(current) || !llist_empty(&ctx->work_llist);
+	return task_work_pending(current) || io_local_work_pending(ctx);
 }
 
 static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts)
@@ -484,6 +489,6 @@ enum {
 static inline bool io_has_work(struct io_ring_ctx *ctx)
 {
 	return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
-	       !llist_empty(&ctx->work_llist);
+	       io_local_work_pending(ctx);
 }
 #endif
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 6e6ee79ba94f..57de9bccbf50 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -73,6 +73,8 @@ void *io_pages_map(struct page ***out_pages, unsigned short *npages,
 	ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
 	if (!IS_ERR(ret))
 		goto done;
+	if (nr_pages == 1)
+		goto fail;
 
 	ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
 	if (!IS_ERR(ret)) {
@@ -81,7 +83,7 @@ done:
 		*npages = nr_pages;
 		return ret;
 	}
-
+fail:
 	kvfree(pages);
 	*out_pages = NULL;
 	*npages = 0;
@@ -136,7 +138,12 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
 	struct page **pages;
 	int ret;
 
-	end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (check_add_overflow(uaddr, len, &end))
+		return ERR_PTR(-EOVERFLOW);
+	if (check_add_overflow(end, PAGE_SIZE - 1, &end))
+		return ERR_PTR(-EOVERFLOW);
+
+	end = end >> PAGE_SHIFT;
 	start = uaddr >> PAGE_SHIFT;
 	nr_pages = end - start;
 	if (WARN_ON_ONCE(!nr_pages))
@@ -229,7 +236,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
 	if (!reg->size || reg->mmap_offset || reg->id)
 		return -EINVAL;
 	if ((reg->size >> PAGE_SHIFT) > INT_MAX)
-		return E2BIG;
+		return -E2BIG;
 	if ((reg->user_addr | reg->size) & ~PAGE_MASK)
 		return -EINVAL;
 	if (check_add_overflow(reg->user_addr, reg->size, &end))
diff --git a/io_uring/nop.c b/io_uring/nop.c
index 6d470d4251ee..5e5196df650a 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -35,10 +35,14 @@ int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		nop->result = READ_ONCE(sqe->len);
 	else
 		nop->result = 0;
-	if (nop->flags & IORING_NOP_FIXED_FILE)
+	if (nop->flags & IORING_NOP_FILE)
 		nop->fd = READ_ONCE(sqe->fd);
+	else
+		nop->fd = -1;
 	if (nop->flags & IORING_NOP_FIXED_BUFFER)
 		nop->buffer = READ_ONCE(sqe->buf_index);
+	else
+		nop->buffer = -1;
 	return 0;
 }
 
diff --git a/io_uring/register.c b/io_uring/register.c
index 1a60f4916649..1e99c783abdf 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -905,9 +905,10 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 
 	mutex_lock(&ctx->uring_lock);
 	ret = __io_uring_register(ctx, opcode, arg, nr_args);
-	mutex_unlock(&ctx->uring_lock);
+
 	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
 				ctx->buf_table.nr, ret);
+	mutex_unlock(&ctx->uring_lock);
 	if (!use_registered_ring)
 		fput(file);
 	return ret;
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 503f3ff8bc4f..adc6e42c14df 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -47,8 +47,19 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
 void __io_uring_free(struct task_struct *tsk)
 {
 	struct io_uring_task *tctx = tsk->io_uring;
+	struct io_tctx_node *node;
+	unsigned long index;
 
-	WARN_ON_ONCE(!xa_empty(&tctx->xa));
+	/*
+	 * Fault injection forcing allocation errors in the xa_store() path
+	 * can lead to xa_empty() returning false, even though no actual
+	 * node is stored in the xarray. Until that gets sorted out, attempt
+	 * an iteration here and warn if any entries are found.
+	 */
+	xa_for_each(&tctx->xa, index, node) {
+		WARN_ON_ONCE(1);
+		break;
+	}
 	WARN_ON_ONCE(tctx->io_wq);
 	WARN_ON_ONCE(tctx->cached_refs);
 