diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-09-24 11:11:38 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-09-24 11:11:38 -0700 |
commit | 3147a0689dd9793990ff954369ffcdf2de984b46 (patch) | |
tree | 8c09e627f830e204fdc2009e46c3509b76d6b189 /io_uring | |
parent | 172d513936c707e991c3eca1b79cd8a153171862 (diff) | |
parent | eac2ca2d682f94f46b1973bdf5e77d85d77b8e53 (diff) |
Merge tag 'for-6.12/io_uring-20240922' of git://git.kernel.dk/linux
Pull more io_uring updates from Jens Axboe:
"Mostly just a set of fixes in here, or little changes that didn't get
included in the initial pull request. This contains:
- Move the SQPOLL napi polling outside the submission lock (Olivier)
- Rename of the "copy buffers" API that got added in the 6.12 merge
window. There's really no copying going on, it's just referencing
the buffers. After a bit of consideration, I decided that it was
better to simply rename this to avoid potential confusion (me)
- Shrink struct io_mapped_ubuf from 48 to 32 bytes, by changing it to
start + len tracking rather than having start / end in there, and
by removing the caching of folio_mask when we can just calculate it
from folio_shift when we need it (me)
- Fixes for the SQPOLL affinity checking (me, Felix)
- Fix for how cqring waiting checks for the presence of task_work.
Just check it directly rather than check for a specific
notification mechanism (me)
- Tweak to how request linking is represented in tracing (me)
- Fix a syzbot report that deliberately sets up a huge list of
overflow entries, and then hits rcu stalls when flushing this list.
Just check for the need to preempt, and drop/reacquire locks in the
loop. There's no state maintained over the loop itself, and each
entry is yanked from head-of-list (me)"
* tag 'for-6.12/io_uring-20240922' of git://git.kernel.dk/linux:
io_uring: check if we need to reschedule during overflow flush
io_uring: improve request linking trace
io_uring: check for presence of task_work rather than TIF_NOTIFY_SIGNAL
io_uring/sqpoll: do the napi busy poll outside the submission block
io_uring: clean up a type in io_uring_register_get_file()
io_uring/sqpoll: do not put cpumask on stack
io_uring/sqpoll: retain test for whether the CPU is valid
io_uring/rsrc: change ubuf->ubuf_end to length tracking
io_uring/rsrc: get rid of io_mapped_ubuf->folio_mask
io_uring: rename "copy buffers" to "clone buffers"
Diffstat (limited to 'io_uring')
-rw-r--r-- | io_uring/fdinfo.c | 3 |
-rw-r--r-- | io_uring/io_uring.c | 21 |
-rw-r--r-- | io_uring/register.c | 6 |
-rw-r--r-- | io_uring/register.h | 2 |
-rw-r--r-- | io_uring/rsrc.c | 23 |
-rw-r--r-- | io_uring/rsrc.h | 7 |
-rw-r--r-- | io_uring/sqpoll.c | 22 |
7 files changed, 52 insertions, 32 deletions
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index d43e1b5fcb36..6b1247664b35 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -177,9 +177,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { struct io_mapped_ubuf *buf = ctx->user_bufs[i]; - unsigned int len = buf->ubuf_end - buf->ubuf; - seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); + seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len); } if (has_lock && !xa_empty(&ctx->personalities)) { unsigned long index; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f3570e81ecb4..feb61d68dca6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -635,6 +635,21 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) } list_del(&ocqe->list); kfree(ocqe); + + /* + * For silly syzbot cases that deliberately overflow by huge + * amounts, check if we need to resched and drop and + * reacquire the locks if so. Nothing real would ever hit this. + * Ideally we'd have a non-posting unlock for this, but hard + * to care for a non-real case. + */ + if (need_resched()) { + io_cq_unlock_post(ctx); + mutex_unlock(&ctx->uring_lock); + cond_resched(); + mutex_lock(&ctx->uring_lock); + io_cq_lock(ctx); + } } if (list_empty(&ctx->cq_overflow_list)) { @@ -2164,7 +2179,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, * conditions are true (normal request), then just queue it. 
*/ if (unlikely(link->head)) { - trace_io_uring_link(req, link->head); + trace_io_uring_link(req, link->last); link->last->link = req; link->last = req; @@ -2472,7 +2487,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, return 1; if (unlikely(!llist_empty(&ctx->work_llist))) return 1; - if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) + if (unlikely(task_work_pending(current))) return 1; if (unlikely(task_sigpending(current))) return -EINTR; @@ -2579,9 +2594,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, * If we got woken because of task_work being processed, run it * now rather than let the caller do another wait loop. */ - io_run_task_work(); if (!llist_empty(&ctx->work_llist)) io_run_local_work(ctx, nr_wait); + io_run_task_work(); /* * Non-local task_work will be run on exit to userspace, but diff --git a/io_uring/register.c b/io_uring/register.c index dab0f8024ddf..eca26d4884d9 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -542,11 +542,11 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_clock(ctx, arg); break; - case IORING_REGISTER_COPY_BUFFERS: + case IORING_REGISTER_CLONE_BUFFERS: ret = -EINVAL; if (!arg || nr_args != 1) break; - ret = io_register_copy_buffers(ctx, arg); + ret = io_register_clone_buffers(ctx, arg); break; default: ret = -EINVAL; @@ -561,7 +561,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, * true, then the registered index is used. Otherwise, the normal fd table. * Caller must call fput() on the returned file, unless it's an ERR_PTR. 
*/ -struct file *io_uring_register_get_file(int fd, bool registered) +struct file *io_uring_register_get_file(unsigned int fd, bool registered) { struct file *file; diff --git a/io_uring/register.h b/io_uring/register.h index cc69b88338fe..a5f39d5ef9e0 100644 --- a/io_uring/register.h +++ b/io_uring/register.h @@ -4,6 +4,6 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx); int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id); -struct file *io_uring_register_get_file(int fd, bool registered); +struct file *io_uring_register_get_file(unsigned int fd, bool registered); #endif diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index a7164aa7d13e..33a3d156a85b 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -38,7 +38,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, static const struct io_mapped_ubuf dummy_ubuf = { /* set invalid range, so io_import_fixed() fails meeting it */ .ubuf = -1UL, - .ubuf_end = 0, + .len = UINT_MAX, }; int __io_account_mem(struct user_struct *user, unsigned long nr_pages) @@ -991,16 +991,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, size = iov->iov_len; /* store original address for later verification */ imu->ubuf = (unsigned long) iov->iov_base; - imu->ubuf_end = imu->ubuf + iov->iov_len; + imu->len = iov->iov_len; imu->nr_bvecs = nr_pages; imu->folio_shift = PAGE_SHIFT; - imu->folio_mask = PAGE_MASK; - if (coalesced) { + if (coalesced) imu->folio_shift = data.folio_shift; - imu->folio_mask = ~((1UL << data.folio_shift) - 1); - } refcount_set(&imu->refs, 1); - off = (unsigned long) iov->iov_base & ~imu->folio_mask; + off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1); *pimu = imu; ret = 0; @@ -1100,7 +1097,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter, if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ - if (unlikely(buf_addr < imu->ubuf || buf_end > 
imu->ubuf_end)) + if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) return -EFAULT; /* @@ -1143,14 +1140,14 @@ int io_import_fixed(int ddir, struct iov_iter *iter, iter->bvec = bvec + seg_skip; iter->nr_segs -= seg_skip; iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ~imu->folio_mask; + iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); } } return 0; } -static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx) +static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx) { struct io_mapped_ubuf **user_bufs; struct io_rsrc_data *data; @@ -1214,9 +1211,9 @@ out_unlock: * * Since the memory is already accounted once, don't account it again. */ -int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg) +int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) { - struct io_uring_copy_buffers buf; + struct io_uring_clone_buffers buf; bool registered_src; struct file *file; int ret; @@ -1234,7 +1231,7 @@ int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg) file = io_uring_register_get_file(buf.src_fd, registered_src); if (IS_ERR(file)) return PTR_ERR(file); - ret = io_copy_buffers(ctx, file->private_data); + ret = io_clone_buffers(ctx, file->private_data); if (!registered_src) fput(file); return ret; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 93546ab337a6..8ed588036210 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -42,12 +42,11 @@ struct io_rsrc_node { struct io_mapped_ubuf { u64 ubuf; - u64 ubuf_end; + unsigned int len; unsigned int nr_bvecs; unsigned int folio_shift; - unsigned long acct_pages; - unsigned long folio_mask; refcount_t refs; + unsigned long acct_pages; struct bio_vec bvec[] __counted_by(nr_bvecs); }; @@ -68,7 +67,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len); -int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user 
*arg); +int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 4a59a024278a..a26593979887 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -196,9 +196,6 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); - if (io_napi(ctx)) - ret += io_napi_sqpoll_busy_poll(ctx); - if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) wake_up(&ctx->sqo_sq_wait); if (creds) @@ -323,6 +320,10 @@ static int io_sq_thread(void *data) if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE)) sqt_spin = true; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + if (io_napi(ctx)) + io_napi_sqpoll_busy_poll(ctx); + if (sqt_spin || !time_after(jiffies, timeout)) { if (sqt_spin) { io_sq_update_worktime(sqd, &start); @@ -461,13 +462,22 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, return 0; if (p->flags & IORING_SETUP_SQ_AFF) { - struct cpumask allowed_mask; + cpumask_var_t allowed_mask; int cpu = p->sq_thread_cpu; ret = -EINVAL; - cpuset_cpus_allowed(current, &allowed_mask); - if (!cpumask_test_cpu(cpu, &allowed_mask)) + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + goto err_sqpoll; + ret = -ENOMEM; + if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL)) + goto err_sqpoll; + ret = -EINVAL; + cpuset_cpus_allowed(current, allowed_mask); + if (!cpumask_test_cpu(cpu, allowed_mask)) { + free_cpumask_var(allowed_mask); goto err_sqpoll; + } + free_cpumask_var(allowed_mask); sqd->sq_cpu = cpu; } else { sqd->sq_cpu = -1; |