Diffstat (limited to 'io_uring')
-rw-r--r--   io_uring/cancel.c     9
-rw-r--r--   io_uring/fdinfo.c    12
-rw-r--r--   io_uring/io-wq.c      6
-rw-r--r--   io_uring/io_uring.c  49
-rw-r--r--   io_uring/io_uring.h  15
-rw-r--r--   io_uring/poll.c      48
-rw-r--r--   io_uring/rw.c         6
7 files changed, 103 insertions, 42 deletions
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 2291a53cdabd..b4f5dfacc0c3 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -288,24 +288,23 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
 
 		ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);
 
+		mutex_unlock(&ctx->uring_lock);
 		if (ret != -EALREADY)
 			break;
 
-		mutex_unlock(&ctx->uring_lock);
 		ret = io_run_task_work_sig(ctx);
-		if (ret < 0) {
-			mutex_lock(&ctx->uring_lock);
+		if (ret < 0)
 			break;
-		}
 		ret = schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS);
-		mutex_lock(&ctx->uring_lock);
 		if (!ret) {
 			ret = -ETIME;
 			break;
 		}
+		mutex_lock(&ctx->uring_lock);
 	} while (1);
 
 	finish_wait(&ctx->cq_wait, &wait);
+	mutex_lock(&ctx->uring_lock);
 
 	if (ret == -ENOENT || ret > 0)
 		ret = 0;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 2e04850a657b..882bd56b01ed 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -170,12 +170,11 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
 		xa_for_each(&ctx->personalities, index, cred)
 			io_uring_show_cred(m, index, cred);
 	}
-	if (has_lock)
-		mutex_unlock(&ctx->uring_lock);
 
 	seq_puts(m, "PollList:\n");
 	for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) {
 		struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
+		struct io_hash_bucket *hbl = &ctx->cancel_table_locked.hbs[i];
 		struct io_kiocb *req;
 
 		spin_lock(&hb->lock);
@@ -183,8 +182,17 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
 			seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
 					task_work_pending(req->task));
 		spin_unlock(&hb->lock);
+
+		if (!has_lock)
+			continue;
+		hlist_for_each_entry(req, &hbl->list, hash_node)
+			seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
+					task_work_pending(req->task));
 	}
 
+	if (has_lock)
+		mutex_unlock(&ctx->uring_lock);
+
 	seq_puts(m, "CqOverflowList:\n");
 	spin_lock(&ctx->completion_lock);
 	list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 6f1d0e5df23a..411bb2d1acd4 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -1230,6 +1230,12 @@ static void io_wq_cancel_tw_create(struct io_wq *wq)
 
 		worker = container_of(cb, struct io_worker, create_work);
 		io_worker_cancel_cb(worker);
+		/*
+		 * Only the worker continuation helper has worker allocated and
+		 * hence needs freeing.
+		 */
+		if (cb->func == create_worker_cont)
+			kfree(worker);
 	}
 }
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ff2bbac1a10f..2ac1cd8d23ea 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -677,16 +677,20 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 	io_cq_unlock_post(ctx);
 }
 
+static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
+{
+	/* iopoll syncs against uring_lock, not completion_lock */
+	if (ctx->flags & IORING_SETUP_IOPOLL)
+		mutex_lock(&ctx->uring_lock);
+	__io_cqring_overflow_flush(ctx);
+	if (ctx->flags & IORING_SETUP_IOPOLL)
+		mutex_unlock(&ctx->uring_lock);
+}
+
 static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 {
-	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
-		/* iopoll syncs against uring_lock, not completion_lock */
-		if (ctx->flags & IORING_SETUP_IOPOLL)
-			mutex_lock(&ctx->uring_lock);
-		__io_cqring_overflow_flush(ctx);
-		if (ctx->flags & IORING_SETUP_IOPOLL)
-			mutex_unlock(&ctx->uring_lock);
-	}
+	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
+		io_cqring_do_overflow_flush(ctx);
 }
 
 void __io_put_task(struct task_struct *task, int nr)
@@ -727,6 +731,8 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 	size_t ocq_size = sizeof(struct io_overflow_cqe);
 	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
 
+	lockdep_assert_held(&ctx->completion_lock);
+
 	if (is_cqe32)
 		ocq_size += sizeof(struct io_uring_cqe);
 
@@ -816,9 +822,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 {
 	struct io_uring_cqe *cqe;
 
-	if (!ctx->task_complete)
-		lockdep_assert_held(&ctx->completion_lock);
-
 	ctx->cq_extra++;
 
 	/*
@@ -1232,13 +1235,18 @@ static void io_req_local_work_add(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
-	if (!llist_add(&req->io_task_work.node, &ctx->work_llist))
+	percpu_ref_get(&ctx->refs);
+
+	if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) {
+		percpu_ref_put(&ctx->refs);
 		return;
+	}
 
 	/* need it for the following io_cqring_wake() */
 	smp_mb__after_atomic();
 
 	if (unlikely(atomic_read(&req->task->io_uring->in_idle))) {
 		io_move_task_work_from_local(ctx);
+		percpu_ref_put(&ctx->refs);
 		return;
 	}
 
@@ -1248,6 +1256,7 @@ static void io_req_local_work_add(struct io_kiocb *req)
 	if (ctx->has_evfd)
 		io_eventfd_signal(ctx);
 	__io_cqring_wake(ctx);
+	percpu_ref_put(&ctx->refs);
 }
 
 void __io_req_task_work_add(struct io_kiocb *req, bool allow_local)
@@ -2461,7 +2470,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx)
 /* when returns >0, the caller should retry */
 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 					  struct io_wait_queue *iowq,
-					  ktime_t timeout)
+					  ktime_t *timeout)
 {
 	int ret;
 	unsigned long check_cq;
@@ -2479,7 +2488,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 		if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
 			return -EBADR;
 	}
-	if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
+	if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS))
 		return -ETIME;
 
 	/*
@@ -2549,10 +2558,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 
 	trace_io_uring_cqring_wait(ctx, min_events);
 	do {
-		io_cqring_overflow_flush(ctx);
+		if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
+			finish_wait(&ctx->cq_wait, &iowq.wq);
+			io_cqring_do_overflow_flush(ctx);
+		}
 		prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
 						TASK_INTERRUPTIBLE);
-		ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
+		ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
 		if (__io_cqring_events_user(ctx) >= min_events)
 			break;
 		cond_resched();
@@ -4013,8 +4025,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 		return -EEXIST;
 
 	if (ctx->restricted) {
-		if (opcode >= IORING_REGISTER_LAST)
-			return -EINVAL;
 		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
 		if (!test_bit(opcode, ctx->restrictions.register_op))
 			return -EACCES;
@@ -4170,6 +4180,9 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 	long ret = -EBADF;
 	struct fd f;
 
+	if (opcode >= IORING_REGISTER_LAST)
+		return -EINVAL;
+
 	f = fdget(fd);
 	if (!f.file)
 		return -EBADF;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index e9f0d41ebb99..ab4b2a1c3b7e 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -79,6 +79,19 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
 			bool cancel_all);
 
+#define io_lockdep_assert_cq_locked(ctx)				\
+	do {								\
+		if (ctx->flags & IORING_SETUP_IOPOLL) {			\
+			lockdep_assert_held(&ctx->uring_lock);		\
+		} else if (!ctx->task_complete) {			\
+			lockdep_assert_held(&ctx->completion_lock);	\
+		} else if (ctx->submitter_task->flags & PF_EXITING) {	\
+			lockdep_assert(current_work());			\
+		} else {						\
+			lockdep_assert(current == ctx->submitter_task);	\
+		}							\
+	} while (0)
+
 static inline void io_req_task_work_add(struct io_kiocb *req)
 {
 	__io_req_task_work_add(req, true);
@@ -92,6 +105,8 @@ void io_cq_unlock_post(struct io_ring_ctx *ctx);
 static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx,
 						       bool overflow)
 {
+	io_lockdep_assert_cq_locked(ctx);
+
 	if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
 		struct io_uring_cqe *cqe = ctx->cqe_cached;
 
diff --git a/io_uring/poll.c b/io_uring/poll.c
index ee7da6150ec4..32e5fc8365e6 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -223,21 +223,22 @@ enum {
 	IOU_POLL_DONE = 0,
 	IOU_POLL_NO_ACTION = 1,
 	IOU_POLL_REMOVE_POLL_USE_RES = 2,
+	IOU_POLL_REISSUE = 3,
 };
 
 /*
  * All poll tw should go through this. Checks for poll events, manages
  * references, does rewait, etc.
  *
- * Returns a negative error on failure. IOU_POLL_NO_ACTION when no action require,
- * which is either spurious wakeup or multishot CQE is served.
- * IOU_POLL_DONE when it's done with the request, then the mask is stored in req->cqe.res.
- * IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot poll and that the result
- * is stored in req->cqe.
+ * Returns a negative error on failure. IOU_POLL_NO_ACTION when no action
+ * require, which is either spurious wakeup or multishot CQE is served.
+ * IOU_POLL_DONE when it's done with the request, then the mask is stored in
+ * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot
+ * poll and that the result is stored in req->cqe.
  */
 static int io_poll_check_events(struct io_kiocb *req, bool *locked)
 {
-	int v, ret;
+	int v;
 
 	/* req->task == current here, checking PF_EXITING is safe */
 	if (unlikely(req->task->flags & PF_EXITING))
@@ -276,10 +277,15 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
 		if (!req->cqe.res) {
 			struct poll_table_struct pt = { ._key = req->apoll_events };
 			req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
+			/*
+			 * We got woken with a mask, but someone else got to
+			 * it first. The above vfs_poll() doesn't add us back
+			 * to the waitqueue, so if we get nothing back, we
+			 * should be safe and attempt a reissue.
+			 */
+			if (unlikely(!req->cqe.res))
+				return IOU_POLL_REISSUE;
 		}
-
-		if ((unlikely(!req->cqe.res)))
-			continue;
 
 		if (req->apoll_events & EPOLLONESHOT)
 			return IOU_POLL_DONE;
@@ -294,7 +300,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
 				return IOU_POLL_REMOVE_POLL_USE_RES;
 			}
 		} else {
-			ret = io_poll_issue(req, locked);
+			int ret = io_poll_issue(req, locked);
 			if (ret == IOU_STOP_MULTISHOT)
 				return IOU_POLL_REMOVE_POLL_USE_RES;
 			if (ret < 0)
@@ -330,6 +336,9 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 
 		poll = io_kiocb_to_cmd(req, struct io_poll);
 		req->cqe.res = mangle_poll(req->cqe.res & poll->events);
+	} else if (ret == IOU_POLL_REISSUE) {
+		io_req_task_submit(req, locked);
+		return;
 	} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
 		req->cqe.res = ret;
 		req_set_fail(req);
@@ -342,7 +351,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 
 	if (ret == IOU_POLL_REMOVE_POLL_USE_RES)
 		io_req_task_complete(req, locked);
-	else if (ret == IOU_POLL_DONE)
+	else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE)
 		io_req_task_submit(req, locked);
 	else
 		io_req_defer_failed(req, ret);
@@ -533,6 +542,14 @@ static bool io_poll_can_finish_inline(struct io_kiocb *req,
 	return pt->owning || io_poll_get_ownership(req);
 }
 
+static void io_poll_add_hash(struct io_kiocb *req)
+{
+	if (req->flags & REQ_F_HASH_LOCKED)
+		io_poll_req_insert_locked(req);
+	else
+		io_poll_req_insert(req);
+}
+
 /*
  * Returns 0 when it's handed over for polling. The caller owns the requests if
  * it returns non-zero, but otherwise should not touch it. Negative values
@@ -591,18 +608,17 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 
 	if (mask &&
 	   ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) {
-		if (!io_poll_can_finish_inline(req, ipt))
+		if (!io_poll_can_finish_inline(req, ipt)) {
+			io_poll_add_hash(req);
 			return 0;
+		}
 		io_poll_remove_entries(req);
 		ipt->result_mask = mask;
 		/* no one else has access to the req, forget about the ref */
 		return 1;
 	}
 
-	if (req->flags & REQ_F_HASH_LOCKED)
-		io_poll_req_insert_locked(req);
-	else
-		io_poll_req_insert(req);
+	io_poll_add_hash(req);
 
 	if (mask && (poll->events & EPOLLET) &&
 	    io_poll_can_finish_inline(req, ipt)) {
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 8227af2e1c0f..9c3ddd46a1ad 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -1062,7 +1062,11 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 			continue;
 
 		req->cqe.flags = io_put_kbuf(req, 0);
-		io_fill_cqe_req(req->ctx, req);
+		if (unlikely(!__io_fill_cqe_req(ctx, req))) {
+			spin_lock(&ctx->completion_lock);
+			io_req_cqe_overflow(req);
+			spin_unlock(&ctx->completion_lock);
+		}
 	}
 
 	if (unlikely(!nr_events))