From fd2206e4e97b5bae422d9f2f9ebbc79bc97e44a5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 Jun 2020 16:40:47 -0600 Subject: io_uring: disallow close of ring itself A previous commit enabled this functionality, which also enabled O_PATH to work correctly with io_uring. But we can't safely close the ring itself, as the file handle isn't reference counted inside io_uring_enter(). Instead of jumping through hoops to enable ring closure, add a "soft" ->needs_file option, ->needs_file_no_error. This enables O_PATH file descriptors to work, but still catches the case of trying to close the ring itself. Reported-by: Jann Horn Fixes: 904fbcb115c8 ("io_uring: remove 'fd is io_uring' from close path") Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9d4bd0d3a080..417b7105c6dc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -698,6 +698,8 @@ struct io_op_def { unsigned needs_mm : 1; /* needs req->file assigned */ unsigned needs_file : 1; + /* don't fail if file grab fails */ + unsigned needs_file_no_error : 1; /* hash wq insertion if file is a regular file */ unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ @@ -804,6 +806,8 @@ static const struct io_op_def io_op_defs[] = { .needs_fs = 1, }, [IORING_OP_CLOSE] = { + .needs_file = 1, + .needs_file_no_error = 1, .file_table = 1, }, [IORING_OP_FILES_UPDATE] = { @@ -3421,6 +3425,10 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EBADF; req->close.fd = READ_ONCE(sqe->fd); + if ((req->file && req->file->f_op == &io_uring_fops) || + req->close.fd == req->ctx->ring_fd) + return -EBADF; + return 0; } @@ -5438,19 +5446,20 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); file = io_file_from_index(ctx, fd); - if (!file) - return -EBADF; - 
req->fixed_file_refs = ctx->file_data->cur_refs; - percpu_ref_get(req->fixed_file_refs); + if (file) { + req->fixed_file_refs = ctx->file_data->cur_refs; + percpu_ref_get(req->fixed_file_refs); + } } else { trace_io_uring_file_get(ctx, fd); file = __io_file_get(state, fd); - if (unlikely(!file)) - return -EBADF; } - *out_file = file; - return 0; + if (file || io_op_defs[req->opcode].needs_file_no_error) { + *out_file = file; + return 0; + } + return -EBADF; } static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, -- cgit v1.2.3-70-g09d2 From 3232dd02af65f2d01be641120d2a710176b0c7a7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 3 Jun 2020 18:03:22 +0300 Subject: io_uring: fix {SQ,IO}POLL with unsupported opcodes IORING_SETUP_IOPOLL is defined only for read/write, other opcodes should be disallowed, otherwise it'll get an error as below. Also refuse open/close with SQPOLL, as the polling thread wouldn't know which file table to use. RIP: 0010:io_iopoll_getevents+0x111/0x5a0 Call Trace: ? _raw_spin_unlock_irqrestore+0x24/0x40 ? do_send_sig_info+0x64/0x90 io_iopoll_reap_events.part.0+0x5e/0xa0 io_ring_ctx_wait_and_kill+0x132/0x1c0 io_uring_release+0x20/0x30 __fput+0xcd/0x230 ____fput+0xe/0x10 task_work_run+0x67/0xa0 do_exit+0x353/0xb10 ? handle_mm_fault+0xd4/0x200 ? 
syscall_trace_enter+0x18c/0x2c0 do_group_exit+0x43/0xa0 __x64_sys_exit_group+0x18/0x20 do_syscall_64+0x60/0x1e0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Signed-off-by: Pavel Begunkov [axboe: allow provide/remove buffers and files update] Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 417b7105c6dc..c627dd9ce096 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2766,6 +2766,8 @@ static int __io_splice_prep(struct io_kiocb *req, if (req->flags & REQ_F_NEED_CLEANUP) return 0; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; sp->file_in = NULL; sp->len = READ_ONCE(sqe->len); @@ -2966,6 +2968,8 @@ static int io_fallocate_prep(struct io_kiocb *req, { if (sqe->ioprio || sqe->buf_index || sqe->rw_flags) return -EINVAL; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->addr); @@ -2991,6 +2995,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) const char __user *fname; int ret; + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; if (sqe->ioprio || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) @@ -3024,6 +3030,8 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) size_t len; int ret; + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; if (sqe->ioprio || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) @@ -3263,6 +3271,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, #if defined(CONFIG_EPOLL) if (sqe->ioprio || sqe->buf_index) return -EINVAL; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; req->epoll.epfd = READ_ONCE(sqe->fd); req->epoll.op = READ_ONCE(sqe->len); @@ -3307,6 +3317,8 @@ static int io_madvise_prep(struct io_kiocb *req, const 
struct io_uring_sqe *sqe) #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) if (sqe->ioprio || sqe->buf_index || sqe->off) return -EINVAL; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; req->madvise.addr = READ_ONCE(sqe->addr); req->madvise.len = READ_ONCE(sqe->len); @@ -3341,6 +3353,8 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (sqe->ioprio || sqe->buf_index || sqe->addr) return -EINVAL; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; req->fadvise.offset = READ_ONCE(sqe->off); req->fadvise.len = READ_ONCE(sqe->len); @@ -3374,6 +3388,8 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock) static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; if (sqe->ioprio || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) @@ -3418,6 +3434,8 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) */ req->work.flags |= IO_WQ_WORK_NO_CANCEL; + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; -- cgit v1.2.3-70-g09d2 From 25e72d1012b30bdff712b563e6141a4f311d28d6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 3 Jun 2020 18:03:23 +0300 Subject: io_uring: do build_open_how() only once build_open_how() is just adjusting open_flags/mode. Do it once during prep. It looks better than storing raw values for the future. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c627dd9ce096..0c5b48467651 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2993,6 +2993,7 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock) static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { const char __user *fname; + u64 flags, mode; int ret; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) @@ -3004,13 +3005,14 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->flags & REQ_F_NEED_CLEANUP) return 0; - req->open.dfd = READ_ONCE(sqe->fd); - req->open.how.mode = READ_ONCE(sqe->len); - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->open.how.flags = READ_ONCE(sqe->open_flags); + mode = READ_ONCE(sqe->len); + flags = READ_ONCE(sqe->open_flags); if (force_o_largefile()) - req->open.how.flags |= O_LARGEFILE; + flags |= O_LARGEFILE; + req->open.how = build_open_how(flags, mode); + req->open.dfd = READ_ONCE(sqe->fd); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); req->open.filename = getname(fname); if (IS_ERR(req->open.filename)) { ret = PTR_ERR(req->open.filename); @@ -3104,7 +3106,6 @@ err: static int io_openat(struct io_kiocb *req, bool force_nonblock) { - req->open.how = build_open_how(req->open.how.flags, req->open.how.mode); return io_openat2(req, force_nonblock); } -- cgit v1.2.3-70-g09d2 From ec65fea5a8d7a82d3137dd2a44197eb577da111f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 3 Jun 2020 18:03:24 +0300 Subject: io_uring: deduplicate io_openat{,2}_prep() io_openat_prep() and io_openat2_prep() are identical except for how struct open_how is built. Deduplicate it with a helper. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 55 +++++++++++++++++++------------------------------------ 1 file changed, 19 insertions(+), 36 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0c5b48467651..4823a116daf2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2990,26 +2990,21 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock) return 0; } -static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { const char __user *fname; - u64 flags, mode; int ret; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (unlikely(sqe->ioprio || sqe->buf_index)) return -EINVAL; - if (req->flags & REQ_F_FIXED_FILE) + if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; - if (req->flags & REQ_F_NEED_CLEANUP) - return 0; - mode = READ_ONCE(sqe->len); - flags = READ_ONCE(sqe->open_flags); - if (force_o_largefile()) - flags |= O_LARGEFILE; - req->open.how = build_open_how(flags, mode); + /* open.how should be already initialised */ + if (!(req->open.how.flags & O_PATH) && force_o_largefile()) + req->open.how.flags |= O_LARGEFILE; req->open.dfd = READ_ONCE(sqe->fd); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -3019,33 +3014,33 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->open.filename = NULL; return ret; } - req->open.nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; return 0; } +static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + u64 flags, mode; + + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; + mode = READ_ONCE(sqe->len); + flags = READ_ONCE(sqe->open_flags); + req->open.how = build_open_how(flags, mode); + return __io_openat_prep(req, sqe); +} + static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { 
struct open_how __user *how; - const char __user *fname; size_t len; int ret; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index) - return -EINVAL; - if (req->flags & REQ_F_FIXED_FILE) - return -EBADF; if (req->flags & REQ_F_NEED_CLEANUP) return 0; - - req->open.dfd = READ_ONCE(sqe->fd); - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); len = READ_ONCE(sqe->len); - if (len < OPEN_HOW_SIZE_VER0) return -EINVAL; @@ -3054,19 +3049,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ret) return ret; - if (!(req->open.how.flags & O_PATH) && force_o_largefile()) - req->open.how.flags |= O_LARGEFILE; - - req->open.filename = getname(fname); - if (IS_ERR(req->open.filename)) { - ret = PTR_ERR(req->open.filename); - req->open.filename = NULL; - return ret; - } - - req->open.nofile = rlimit(RLIMIT_NOFILE); - req->flags |= REQ_F_NEED_CLEANUP; - return 0; + return __io_openat_prep(req, sqe); } static int io_openat2(struct io_kiocb *req, bool force_nonblock) -- cgit v1.2.3-70-g09d2 From d2b6f48b691ed67569786c332f0173b918d3fd1b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 3 Jun 2020 18:03:25 +0300 Subject: io_uring: move send/recv IOPOLL check into prep Fail recv/send in case of IORING_SETUP_IOPOLL earlier during prep, so it'd be done only once. 
Removes duplication as well Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4823a116daf2..d2bd82387a4c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3556,6 +3556,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_async_ctx *io = req->io; int ret; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); @@ -3585,9 +3588,6 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) struct socket *sock; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - sock = sock_from_file(req->file, &ret); if (sock) { struct io_async_ctx io; @@ -3641,9 +3641,6 @@ static int io_send(struct io_kiocb *req, bool force_nonblock) struct socket *sock; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - sock = sock_from_file(req->file, &ret); if (sock) { struct io_sr_msg *sr = &req->sr_msg; @@ -3796,6 +3793,9 @@ static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io = req->io; int ret; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); @@ -3824,9 +3824,6 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) struct socket *sock; int ret, cflags = 0; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - sock = sock_from_file(req->file, &ret); if (sock) { struct io_buffer *kbuf; @@ -3888,9 +3885,6 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock) struct socket *sock; int ret, cflags = 0; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - sock = 
sock_from_file(req->file, &ret); if (sock) { struct io_sr_msg *sr = &req->sr_msg; -- cgit v1.2.3-70-g09d2 From dddb3e26f6d88c5344d28cb5ff9d3d6fa05c4f7a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 4 Jun 2020 11:27:01 -0600 Subject: io_uring: re-set iov base/len for buffer select retry We already have the buffer selected, but we should set the iter list again. Cc: stable@vger.kernel.org # v5.7 Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d2bd82387a4c..70f0f2f940fb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2363,8 +2363,14 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, bool needs_lock) { - if (req->flags & REQ_F_BUFFER_SELECTED) + if (req->flags & REQ_F_BUFFER_SELECTED) { + struct io_buffer *kbuf; + + kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; + iov[0].iov_base = u64_to_user_ptr(kbuf->addr); + iov[0].iov_len = kbuf->len; return 0; + } if (!req->rw.len) return 0; else if (req->rw.len > 1) -- cgit v1.2.3-70-g09d2 From efe68c1ca8f49e8c06afd74b699411bfbb8ba1ff Mon Sep 17 00:00:00 2001 From: Bijan Mottahedeh Date: Thu, 4 Jun 2020 18:01:52 -0700 Subject: io_uring: validate the full range of provided buffers for access Account for the number of provided buffers when validating the address range. 
Signed-off-by: Bijan Mottahedeh Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 70f0f2f940fb..5431b182b6b0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3183,7 +3183,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req, p->addr = READ_ONCE(sqe->addr); p->len = READ_ONCE(sqe->len); - if (!access_ok(u64_to_user_ptr(p->addr), p->len)) + if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs))) return -EFAULT; p->bgid = READ_ONCE(sqe->buf_group); -- cgit v1.2.3-70-g09d2 From a8c73c1a614f6da6c0b04c393f87447e28cb6de4 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 5 Jun 2020 12:32:03 +0300 Subject: io_uring: use kvfree() in io_sqe_buffer_register() Use kvfree() to free the pages and vmas, since they are allocated by kvmalloc_array() in a loop. Fixes: d4ef647510b1 ("io_uring: avoid page allocation warnings") Signed-off-by: Denis Efremov Signed-off-by: Jens Axboe Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200605093203.40087-1-efremov@linux.com --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5431b182b6b0..5e36e78e766e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7171,8 +7171,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ret = 0; if (!pages || nr_pages > got_pages) { - kfree(vmas); - kfree(pages); + kvfree(vmas); + kvfree(pages); pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); vmas = kvmalloc_array(nr_pages, -- cgit v1.2.3-70-g09d2 From 3af73b286ccee493dc055fc58da02b2dc7a5304d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Jun 2020 21:08:17 +0300 Subject: io_uring: don't derive close state from ->func Relying on having a specific work.func is dangerous, even if an opcode handler set it itself. E.g. io_wq_assign_next() can modify it. 
io_close() sets a custom work.func to indicate that __close_fd_get_file() was already called. Fortunately, there are no bugs with io_wq_assign_next() and close yet. Still, play it safe and always be prepared to be called through io_wq_submit_work(). Zero req->close.put_file in prep, and call __close_fd_get_file() IFF it's NULL. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 50 +++++++++++++++++--------------------------------- 1 file changed, 17 insertions(+), 33 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5e36e78e766e..43721f046f03 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3437,53 +3437,37 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->close.fd == req->ctx->ring_fd) return -EBADF; + req->close.put_file = NULL; return 0; } -/* only called when __close_fd_get_file() is done */ -static void __io_close_finish(struct io_kiocb *req) -{ - int ret; - - ret = filp_close(req->close.put_file, req->work.files); - if (ret < 0) - req_set_fail_links(req); - io_cqring_add_event(req, ret); - fput(req->close.put_file); - io_put_req(req); -} - -static void io_close_finish(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - - /* not cancellable, don't do io_req_cancelled() */ - __io_close_finish(req); - io_steal_work(req, workptr); -} - static int io_close(struct io_kiocb *req, bool force_nonblock) { + struct io_close *close = &req->close; int ret; - req->close.put_file = NULL; - ret = __close_fd_get_file(req->close.fd, &req->close.put_file); - if (ret < 0) - return (ret == -ENOENT) ? -EBADF : ret; + /* might be already done during nonblock submission */ + if (!close->put_file) { + ret = __close_fd_get_file(close->fd, &close->put_file); + if (ret < 0) + return (ret == -ENOENT) ? 
-EBADF : ret; + } /* if the file has a flush method, be safe and punt to async */ - if (req->close.put_file->f_op->flush && force_nonblock) { + if (close->put_file->f_op->flush && force_nonblock) { /* avoid grabbing files - we don't need the files */ req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT; - req->work.func = io_close_finish; return -EAGAIN; } - /* - * No ->flush(), safely close from here and just punt the - * fput() to async context. - */ - __io_close_finish(req); + /* No ->flush() or already async, safely close from here */ + ret = filp_close(close->put_file, req->work.files); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + fput(close->put_file); + close->put_file = NULL; + io_put_req(req); return 0; } -- cgit v1.2.3-70-g09d2 From ac45abc0e2a8ed16ecc0eea039fe762ddfefbcad Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Jun 2020 21:08:18 +0300 Subject: io_uring: remove custom ->func handlers In preparation of getting rid of work.func, this removes almost all custom instances of it, leaving only io_wq_submit_work() and io_link_work_cb(). And the last one will be dealt with later. Nothing fancy, just routinely remove *_finish() functions and inline what's left. E.g. remove io_fsync_finish() + inline __io_fsync() into io_fsync(). As no users of io_req_cancelled() are left, delete it as well. The patch adds an extra switch lookup on a cold-ish path, but that's outweighed by the nice diffstat and other benefits of the following patches. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 139 ++++++++++++---------------------------------------------- 1 file changed, 27 insertions(+), 112 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 43721f046f03..42a90e8831bf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2898,23 +2898,15 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static bool io_req_cancelled(struct io_kiocb *req) -{ - if (req->work.flags & IO_WQ_WORK_CANCEL) { - req_set_fail_links(req); - io_cqring_add_event(req, -ECANCELED); - io_put_req(req); - return true; - } - - return false; -} - -static void __io_fsync(struct io_kiocb *req) +static int io_fsync(struct io_kiocb *req, bool force_nonblock) { loff_t end = req->sync.off + req->sync.len; int ret; + /* fsync always requires a blocking context */ + if (force_nonblock) + return -EAGAIN; + ret = vfs_fsync_range(req->file, req->sync.off, end > 0 ? end : LLONG_MAX, req->sync.flags & IORING_FSYNC_DATASYNC); @@ -2922,53 +2914,9 @@ static void __io_fsync(struct io_kiocb *req) req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req(req); -} - -static void io_fsync_finish(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - - if (io_req_cancelled(req)) - return; - __io_fsync(req); - io_steal_work(req, workptr); -} - -static int io_fsync(struct io_kiocb *req, bool force_nonblock) -{ - /* fsync always requires a blocking context */ - if (force_nonblock) { - req->work.func = io_fsync_finish; - return -EAGAIN; - } - __io_fsync(req); return 0; } -static void __io_fallocate(struct io_kiocb *req) -{ - int ret; - - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; - ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, - req->sync.len); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - if (ret < 0) - req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); 
-} - -static void io_fallocate_finish(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - - if (io_req_cancelled(req)) - return; - __io_fallocate(req); - io_steal_work(req, workptr); -} - static int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -2986,13 +2934,20 @@ static int io_fallocate_prep(struct io_kiocb *req, static int io_fallocate(struct io_kiocb *req, bool force_nonblock) { + int ret; + /* fallocate always requiring blocking context */ - if (force_nonblock) { - req->work.func = io_fallocate_finish; + if (force_nonblock) return -EAGAIN; - } - __io_fallocate(req); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; + ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, + req->sync.len); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); return 0; } @@ -3489,38 +3444,20 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static void __io_sync_file_range(struct io_kiocb *req) +static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) { int ret; + /* sync_file_range always requires a blocking context */ + if (force_nonblock) + return -EAGAIN; + ret = sync_file_range(req->file, req->sync.off, req->sync.len, req->sync.flags); if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req(req); -} - - -static void io_sync_file_range_finish(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - - if (io_req_cancelled(req)) - return; - __io_sync_file_range(req); - io_steal_work(req, workptr); -} - -static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) -{ - /* sync_file_range always requires a blocking context */ - if (force_nonblock) { - req->work.func = io_sync_file_range_finish; - return -EAGAIN; - } - - 
__io_sync_file_range(req); return 0; } @@ -3942,49 +3879,27 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int __io_accept(struct io_kiocb *req, bool force_nonblock) +static int io_accept(struct io_kiocb *req, bool force_nonblock) { struct io_accept *accept = &req->accept; - unsigned file_flags; + unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; int ret; - file_flags = force_nonblock ? O_NONBLOCK : 0; ret = __sys_accept4_file(req->file, file_flags, accept->addr, accept->addr_len, accept->flags, accept->nofile); if (ret == -EAGAIN && force_nonblock) return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret < 0) + if (ret < 0) { + if (ret == -ERESTARTSYS) + ret = -EINTR; req_set_fail_links(req); + } io_cqring_add_event(req, ret); io_put_req(req); return 0; } -static void io_accept_finish(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - - if (io_req_cancelled(req)) - return; - __io_accept(req, false); - io_steal_work(req, workptr); -} - -static int io_accept(struct io_kiocb *req, bool force_nonblock) -{ - int ret; - - ret = __io_accept(req, force_nonblock); - if (ret == -EAGAIN && force_nonblock) { - req->work.func = io_accept_finish; - return -EAGAIN; - } - return 0; -} - static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = &req->connect; -- cgit v1.2.3-70-g09d2 From d4c81f38522f3e7f4be1b472ef9988d0ed7f3696 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Jun 2020 21:08:19 +0300 Subject: io_uring: don't arm a timeout through work.func Remove io_link_work_cb() -- the last custom work.func. Not the prettiest thing, but works. Instead of queueing a linked timeout in io_link_work_cb() mark a request with REQ_F_QUEUE_TIMEOUT and do enqueueing based on the flag in io_wq_submit_work(). 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 42a90e8831bf..35d96d2a4c8c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -541,6 +541,7 @@ enum { REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, REQ_F_NO_FILE_TABLE_BIT, + REQ_F_QUEUE_TIMEOUT_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -596,6 +597,8 @@ enum { REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), /* doesn't need file table for this request */ REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), + /* needs to queue linked timeout */ + REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT), }; struct async_poll { @@ -1580,16 +1583,6 @@ static void io_free_req(struct io_kiocb *req) io_queue_async_work(nxt); } -static void io_link_work_cb(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct io_kiocb *link; - - link = list_first_entry(&req->link_list, struct io_kiocb, link_list); - io_queue_linked_timeout(link); - io_wq_submit_work(workptr); -} - static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) { struct io_kiocb *link; @@ -1601,7 +1594,7 @@ static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) *workptr = &nxt->work; link = io_prep_linked_timeout(nxt); if (link) - nxt->work.func = io_link_work_cb; + nxt->flags |= REQ_F_QUEUE_TIMEOUT; } /* @@ -5291,12 +5284,26 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } +static void io_arm_async_linked_timeout(struct io_kiocb *req) +{ + struct io_kiocb *link; + + /* link head's timeout is queued in io_queue_async_work() */ + if (!(req->flags & REQ_F_QUEUE_TIMEOUT)) + return; + + link = list_first_entry(&req->link_list, struct io_kiocb, link_list); + io_queue_linked_timeout(link); +} + static void 
io_wq_submit_work(struct io_wq_work **workptr) { struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); int ret = 0; + io_arm_async_linked_timeout(req); + /* if NO_CANCEL is set, we must still run the work */ if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) == IO_WQ_WORK_CANCEL) { -- cgit v1.2.3-70-g09d2 From f5fa38c59cb0b40633dee5cdf7465801be3e4928 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Jun 2020 21:08:20 +0300 Subject: io_wq: add per-wq work handler instead of per work io_uring is the only user of io-wq, and now it uses only io-wq callback for all its requests, namely io_wq_submit_work(). Instead of storing work->runner callback in each instance of io_wq_work, keep it in io-wq itself. pros: - reduces io_wq_work size - more robust -- ->func won't be invalidated with mem{cpy,set}(req) - helps other work Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 10 ++++++---- fs/io-wq.h | 7 ++++--- fs/io_uring.c | 3 ++- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 4023c9846860..d7dc638f4b8e 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -112,6 +112,7 @@ struct io_wq { unsigned long state; free_work_fn *free_work; + io_wq_work_fn *do_work; struct task_struct *manager; struct user_struct *user; @@ -528,7 +529,7 @@ get_next: hash = io_get_work_hash(work); linked = old_work = work; - linked->func(&linked); + wq->do_work(&linked); linked = (old_work == linked) ? NULL : linked; work = next_hashed; @@ -785,7 +786,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) struct io_wq_work *old_work = work; work->flags |= IO_WQ_WORK_CANCEL; - work->func(&work); + wq->do_work(&work); work = (work == old_work) ? 
NULL : work; wq->free_work(old_work); } while (work); @@ -1023,7 +1024,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) int ret = -ENOMEM, node; struct io_wq *wq; - if (WARN_ON_ONCE(!data->free_work)) + if (WARN_ON_ONCE(!data->free_work || !data->do_work)) return ERR_PTR(-EINVAL); wq = kzalloc(sizeof(*wq), GFP_KERNEL); @@ -1037,6 +1038,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) } wq->free_work = data->free_work; + wq->do_work = data->do_work; /* caller must already hold a reference to this */ wq->user = data->user; @@ -1093,7 +1095,7 @@ err: bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) { - if (data->free_work != wq->free_work) + if (data->free_work != wq->free_work || data->do_work != wq->do_work) return false; return refcount_inc_not_zero(&wq->use_refs); diff --git a/fs/io-wq.h b/fs/io-wq.h index 5ba12de7572f..2db24d31fbc5 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -85,7 +85,6 @@ static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work { struct io_wq_work_node list; - void (*func)(struct io_wq_work **); struct files_struct *files; struct mm_struct *mm; const struct cred *creds; @@ -94,9 +93,9 @@ struct io_wq_work { pid_t task_pid; }; -#define INIT_IO_WORK(work, _func) \ +#define INIT_IO_WORK(work) \ do { \ - *(work) = (struct io_wq_work){ .func = _func }; \ + *(work) = (struct io_wq_work){}; \ } while (0) \ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) @@ -108,10 +107,12 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) } typedef void (free_work_fn)(struct io_wq_work *); +typedef void (io_wq_work_fn)(struct io_wq_work **); struct io_wq_data { struct user_struct *user; + io_wq_work_fn *do_work; free_work_fn *free_work; }; diff --git a/fs/io_uring.c b/fs/io_uring.c index 35d96d2a4c8c..3ffe03194c1e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5776,7 +5776,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct 
io_kiocb *req, refcount_set(&req->refs, 2); req->task = NULL; req->result = 0; - INIT_IO_WORK(&req->work, io_wq_submit_work); + INIT_IO_WORK(&req->work); if (unlikely(req->opcode >= IORING_OP_LAST)) return -EINVAL; @@ -6796,6 +6796,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx, data.user = ctx->user; data.free_work = io_free_work; + data.do_work = io_wq_submit_work; if (!(p->flags & IORING_SETUP_ATTACH_WQ)) { /* Do QD, or 4 * CPUS, whatever is smallest */ -- cgit v1.2.3-70-g09d2 From c5b856255cbc3b664d686a83fa9397a835e063de Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Jun 2020 19:23:05 -0600 Subject: io_uring: allow O_NONBLOCK async retry We can assume that O_NONBLOCK is always honored, even if we don't have a ->read/write_iter() for the file type. Also unify the read/write checking for allowing async punt, having the write side factoring in the REQ_F_NOWAIT flag as well. Cc: stable@vger.kernel.org Fixes: 490e89676a52 ("io_uring: only force async punt if poll based retry can't handle it") Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3ffe03194c1e..ebea82e09963 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2061,6 +2061,10 @@ static bool io_file_supports_async(struct file *file, int rw) if (S_ISREG(mode) && file->f_op != &io_uring_fops) return true; + /* any ->read/write should understand O_NONBLOCK */ + if (file->f_flags & O_NONBLOCK) + return true; + if (!(file->f_mode & FMODE_NOWAIT)) return false; @@ -2103,8 +2107,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_ioprio = get_current_ioprio(); /* don't allow async punt if RWF_NOWAIT was requested */ - if ((kiocb->ki_flags & IOCB_NOWAIT) || - (req->file->f_flags & O_NONBLOCK)) + if (kiocb->ki_flags & IOCB_NOWAIT) req->flags |= REQ_F_NOWAIT; if (force_nonblock) @@ -2745,7 +2748,8 @@ copy_iov: if (ret) goto out_free; /* any defer here is 
final, must blocking retry */ - if (!file_can_poll(req->file)) + if (!(req->flags & REQ_F_NOWAIT) && + !file_can_poll(req->file)) req->flags |= REQ_F_MUST_PUNT; return -EAGAIN; } -- cgit v1.2.3-70-g09d2 From 7cdaf587de7c6f494b8433fded19f7728e70e1ef Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Wed, 10 Jun 2020 19:41:19 +0800 Subject: io_uring: avoid whole io_wq_work copy for requests completed inline If requests can be submitted and completed inline, we don't need to initialize the whole io_wq_work in io_init_req(), which is an expensive operation. Add a new flag, 'REQ_F_WORK_INITIALIZED', to track whether io_wq_work has been initialized, and add a helper, io_req_init_async(); users must call io_req_init_async() before touching any member of io_wq_work for the first time. I used /dev/nullb0 to evaluate the performance improvement on my physical machine: modprobe null_blk nr_devices=1 completion_nsec=0 sudo taskset -c 60 fio -name=fiotest -filename=/dev/nullb0 -iodepth=128 -thread -rw=read -ioengine=io_uring -direct=1 -bs=4k -size=100G -numjobs=1 -time_based -runtime=120 Before this patch: Run status group 0 (all jobs): READ: bw=724MiB/s (759MB/s), 724MiB/s-724MiB/s (759MB/s-759MB/s), io=84.8GiB (91.1GB), run=120001-120001msec With this patch: Run status group 0 (all jobs): READ: bw=761MiB/s (798MB/s), 761MiB/s-761MiB/s (798MB/s-798MB/s), io=89.2GiB (95.8GB), run=120001-120001msec About 5% improvement.
Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io-wq.h | 5 ----- fs/io_uring.c | 40 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/fs/io-wq.h b/fs/io-wq.h index 2db24d31fbc5..8e138fa88b9f 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -93,11 +93,6 @@ struct io_wq_work { pid_t task_pid; }; -#define INIT_IO_WORK(work) \ - do { \ - *(work) = (struct io_wq_work){}; \ - } while (0) \ - static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) { if (!work->list.next) diff --git a/fs/io_uring.c b/fs/io_uring.c index ebea82e09963..5a2c004439f0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -542,6 +542,7 @@ enum { REQ_F_BUFFER_SELECTED_BIT, REQ_F_NO_FILE_TABLE_BIT, REQ_F_QUEUE_TIMEOUT_BIT, + REQ_F_WORK_INITIALIZED_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -599,6 +600,8 @@ enum { REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), /* needs to queue linked timeout */ REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT), + /* io_wq_work is initialized */ + REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), }; struct async_poll { @@ -911,6 +914,19 @@ EXPORT_SYMBOL(io_uring_get_socket); static void io_file_put_work(struct work_struct *work); +/* + * Note: must call io_req_init_async() for the first time you + * touch any members of io_wq_work. 
+ */ +static inline void io_req_init_async(struct io_kiocb *req) +{ + if (req->flags & REQ_F_WORK_INITIALIZED) + return; + + memset(&req->work, 0, sizeof(req->work)); + req->flags |= REQ_F_WORK_INITIALIZED; +} + static inline bool io_async_submit(struct io_ring_ctx *ctx) { return ctx->flags & IORING_SETUP_SQPOLL; @@ -1037,6 +1053,9 @@ static inline void io_req_work_grab_env(struct io_kiocb *req, static inline void io_req_work_drop_env(struct io_kiocb *req) { + if (!(req->flags & REQ_F_WORK_INITIALIZED)) + return; + if (req->work.mm) { mmdrop(req->work.mm); req->work.mm = NULL; @@ -2785,8 +2804,14 @@ static int __io_splice_prep(struct io_kiocb *req, return ret; req->flags |= REQ_F_NEED_CLEANUP; - if (!S_ISREG(file_inode(sp->file_in)->i_mode)) + if (!S_ISREG(file_inode(sp->file_in)->i_mode)) { + /* + * Splice operation will be punted aync, and here need to + * modify io_wq_work.flags, so initialize io_wq_work firstly. + */ + io_req_init_async(req); req->work.flags |= IO_WQ_WORK_UNBOUND; + } return 0; } @@ -3372,8 +3397,10 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { /* * If we queue this for async, it must not be cancellable. That would - * leave the 'file' in an undeterminate state. + * leave the 'file' in an undeterminate state, and here need to modify + * io_wq_work.flags, so initialize io_wq_work firstly. 
*/ + io_req_init_async(req); req->work.flags |= IO_WQ_WORK_NO_CANCEL; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) @@ -4851,6 +4878,8 @@ static int io_req_defer_prep(struct io_kiocb *req, if (!sqe) return 0; + io_req_init_async(req); + if (io_op_defs[req->opcode].file_table) { ret = io_grab_files(req); if (unlikely(ret)) @@ -5505,7 +5534,8 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) again: linked_timeout = io_prep_linked_timeout(req); - if (req->work.creds && req->work.creds != current_cred()) { + if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds && + req->work.creds != current_cred()) { if (old_creds) revert_creds(old_creds); if (old_creds == req->work.creds) @@ -5528,6 +5558,8 @@ again: goto exit; } punt: + io_req_init_async(req); + if (io_op_defs[req->opcode].file_table) { ret = io_grab_files(req); if (ret) @@ -5780,7 +5812,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, refcount_set(&req->refs, 2); req->task = NULL; req->result = 0; - INIT_IO_WORK(&req->work); if (unlikely(req->opcode >= IORING_OP_LAST)) return -EINVAL; @@ -5802,6 +5833,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, id = READ_ONCE(sqe->personality); if (id) { + io_req_init_async(req); req->work.creds = idr_find(&ctx->personality_idr, id); if (unlikely(!req->work.creds)) return -EINVAL; -- cgit v1.2.3-70-g09d2 From 405a5d2b2762f2a9813efdee93274d4e7bf607a1 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Wed, 10 Jun 2020 19:41:20 +0800 Subject: io_uring: avoid unnecessary io_wq_work copy for fast poll feature Basically, the IORING_OP_POLL_ADD command and the async armed poll handlers for regular commands don't touch io_wq_work, so we only need to copy and restore io_wq_work when REQ_F_WORK_INITIALIZED is set.
Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5a2c004439f0..d72b2a9463ca 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4262,7 +4262,8 @@ static void io_async_task_func(struct callback_head *cb) spin_unlock_irq(&ctx->completion_lock); /* restore ->work in case we need to retry again */ - memcpy(&req->work, &apoll->work, sizeof(req->work)); + if (req->flags & REQ_F_WORK_INITIALIZED) + memcpy(&req->work, &apoll->work, sizeof(req->work)); kfree(apoll); if (!canceled) { @@ -4359,7 +4360,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req) return false; req->flags |= REQ_F_POLLED; - memcpy(&apoll->work, &req->work, sizeof(req->work)); + if (req->flags & REQ_F_WORK_INITIALIZED) + memcpy(&apoll->work, &req->work, sizeof(req->work)); had_io = req->io != NULL; get_task_struct(current); @@ -4384,7 +4386,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req) if (!had_io) io_poll_remove_double(req); spin_unlock_irq(&ctx->completion_lock); - memcpy(&req->work, &apoll->work, sizeof(req->work)); + if (req->flags & REQ_F_WORK_INITIALIZED) + memcpy(&req->work, &apoll->work, sizeof(req->work)); kfree(apoll); return false; } @@ -4429,7 +4432,9 @@ static bool io_poll_remove_one(struct io_kiocb *req) * io_req_work_drop_env below when dropping the * final reference. */ - memcpy(&req->work, &apoll->work, sizeof(req->work)); + if (req->flags & REQ_F_WORK_INITIALIZED) + memcpy(&req->work, &apoll->work, + sizeof(req->work)); kfree(apoll); } } -- cgit v1.2.3-70-g09d2 From e697deed834de15d2322d0619d51893022c90ea2 Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Wed, 10 Jun 2020 13:41:59 +0800 Subject: io_uring: check file O_NONBLOCK state for accept If the socket is O_NONBLOCK, we should complete the accept request with -EAGAIN when data is not ready. 
Signed-off-by: Jiufei Xue Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index d72b2a9463ca..5b0249140ff5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3909,6 +3909,9 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock) unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; int ret; + if (req->file->f_flags & O_NONBLOCK) + req->flags |= REQ_F_NOWAIT; + ret = __sys_accept4_file(req->file, file_flags, accept->addr, accept->addr_len, accept->flags, accept->nofile); -- cgit v1.2.3-70-g09d2 From 65a6543da386838f935d2f03f452c5c0acff2a68 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 11 Jun 2020 23:39:36 +0800 Subject: io_uring: fix io_kiocb.flags modification race in IOPOLL mode While testing io_uring on arm, we found that io_sq_thread() sometimes keeps polling for io requests even though there are no inflight io requests in the block layer. After some investigation, we found a possible race on io_kiocb.flags; see the racing code below: 1) at the end of io_write() or io_read() req->flags &= ~REQ_F_NEED_CLEANUP; kfree(iovec); return ret; 2) in io_complete_rw_iopoll() if (res != -EAGAIN) req->flags |= REQ_F_IOPOLL_COMPLETED; In IOPOLL mode, io requests may still be completed by interrupt, so the above code is not safe: the concurrent modifications to req->flags are neither protected by a lock nor atomic. I also disassembled io_complete_rw_iopoll() on arm: req->flags |= REQ_F_IOPOLL_COMPLETED; 0xffff000008387b18 <+76>: ldr w0, [x19,#104] 0xffff000008387b1c <+80>: orr w0, w0, #0x1000 0xffff000008387b20 <+84>: str w0, [x19,#104] It seems that "req->flags |= REQ_F_IOPOLL_COMPLETED;" compiles to a separate load, modify and store, which obviously is not atomic. To fix this issue, add a new iopoll_completed field to io_kiocb to indicate whether the io request has completed.
Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5b0249140ff5..61fca5afaac8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -529,7 +529,6 @@ enum { REQ_F_INFLIGHT_BIT, REQ_F_CUR_POS_BIT, REQ_F_NOWAIT_BIT, - REQ_F_IOPOLL_COMPLETED_BIT, REQ_F_LINK_TIMEOUT_BIT, REQ_F_TIMEOUT_BIT, REQ_F_ISREG_BIT, @@ -574,8 +573,6 @@ enum { REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), /* must not punt to workers */ REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), - /* polled IO has completed */ - REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT), /* has linked timeout */ REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), /* timeout request */ @@ -640,6 +637,8 @@ struct io_kiocb { struct io_async_ctx *io; int cflags; u8 opcode; + /* polled IO has completed */ + u8 iopoll_completed; u16 buf_index; @@ -1798,7 +1797,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, * If we find a request that requires polling, break out * and complete those lists first, if we have entries there. */ - if (req->flags & REQ_F_IOPOLL_COMPLETED) { + if (READ_ONCE(req->iopoll_completed)) { list_move_tail(&req->list, &done); continue; } @@ -1979,7 +1978,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) req_set_fail_links(req); req->result = res; if (res != -EAGAIN) - req->flags |= REQ_F_IOPOLL_COMPLETED; + WRITE_ONCE(req->iopoll_completed, 1); } /* @@ -2012,7 +2011,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * For fast devices, IO may have already completed. If it has, add * it to the front so we find it first. 
*/ - if (req->flags & REQ_F_IOPOLL_COMPLETED) + if (READ_ONCE(req->iopoll_completed)) list_add(&req->list, &ctx->poll_list); else list_add_tail(&req->list, &ctx->poll_list); @@ -2140,6 +2139,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; req->result = 0; + req->iopoll_completed = 0; } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; -- cgit v1.2.3-70-g09d2