diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-02-21 11:10:39 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-02-21 11:10:39 -0800 |
commit | 5bbb336ba75d95611a7b9456355b48705016bdb1 (patch) | |
tree | 4cce98f859254371afa50c82cf82d042255da96a /fs/io_uring.c | |
parent | 9820b4dca0f9c6b7ab8b4307286cdace171b724d (diff) | |
parent | 0b81e80c813f92520667c872d499a2dba8377be6 (diff) |
Merge tag 'for-5.12/io_uring-2021-02-17' of git://git.kernel.dk/linux-block
Pull io_uring updates from Jens Axboe:
"Highlights from this cycles are things like request recycling and
task_work optimizations, which net us anywhere from 10-20% of speedups
on workloads that mostly are inline.
This work was originally done to put io_uring under memcg, which adds
considerable overhead. But it's a really nice win as well. Also worth
highlighting is the LOOKUP_CACHED work in the VFS, and using it in
io_uring. Greatly speeds up the fast path for file opens.
Summary:
- Put io_uring under memcg protection. We accounted just the rings
themselves under rlimit memlock before, now we account everything.
- Request cache recycling, persistent across invocations (Pavel, me)
- First part of a cleanup/improvement to buffer registration (Bijan)
- SQPOLL fixes (Hao)
- File registration NULL pointer fixup (Dan)
- LOOKUP_CACHED support for io_uring
- Disable /proc/thread-self/ for io_uring, like we do for /proc/self
- Add Pavel to the io_uring MAINTAINERS entry
- Tons of code cleanups and optimizations (Pavel)
- Support for skip entries in file registration (Noah)"
* tag 'for-5.12/io_uring-2021-02-17' of git://git.kernel.dk/linux-block: (103 commits)
io_uring: tctx->task_lock should be IRQ safe
proc: don't allow async path resolution of /proc/thread-self components
io_uring: kill cached requests from exiting task closing the ring
io_uring: add helper to free all request caches
io_uring: allow task match to be passed to io_req_cache_free()
io-wq: clear out worker ->fs and ->files
io_uring: optimise io_init_req() flags setting
io_uring: clean io_req_find_next() fast check
io_uring: don't check PF_EXITING from syscall
io_uring: don't split out consume out of SQE get
io_uring: save ctx put/get for task_work submit
io_uring: don't duplicate io_req_task_queue()
io_uring: optimise SQPOLL mm/files grabbing
io_uring: optimise out unlikely link queue
io_uring: take compl state from submit state
io_uring: inline io_complete_rw_common()
io_uring: move res check out of io_rw_reissue()
io_uring: simplify iopoll reissuing
io_uring: clean up io_req_free_batch_finish()
io_uring: move submit side state closer in the ring
...
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r-- | fs/io_uring.c | 2606 |
1 files changed, 1328 insertions, 1278 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c index 931671082e61..14ce789927e4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -187,6 +187,11 @@ struct io_rings { struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; }; +enum io_uring_cmd_flags { + IO_URING_F_NONBLOCK = 1, + IO_URING_F_COMPLETE_DEFER = 2, +}; + struct io_mapped_ubuf { u64 ubuf; size_t len; @@ -195,28 +200,38 @@ struct io_mapped_ubuf { unsigned long acct_pages; }; -struct fixed_file_table { +struct io_ring_ctx; + +struct io_rsrc_put { + struct list_head list; + union { + void *rsrc; + struct file *file; + }; +}; + +struct fixed_rsrc_table { struct file **files; }; -struct fixed_file_ref_node { +struct fixed_rsrc_ref_node { struct percpu_ref refs; struct list_head node; - struct list_head file_list; - struct fixed_file_data *file_data; + struct list_head rsrc_list; + struct fixed_rsrc_data *rsrc_data; + void (*rsrc_put)(struct io_ring_ctx *ctx, + struct io_rsrc_put *prsrc); struct llist_node llist; bool done; }; -struct fixed_file_data { - struct fixed_file_table *table; +struct fixed_rsrc_data { + struct fixed_rsrc_table *table; struct io_ring_ctx *ctx; - struct fixed_file_ref_node *node; + struct fixed_rsrc_ref_node *node; struct percpu_ref refs; struct completion done; - struct list_head ref_list; - spinlock_t lock; }; struct io_buffer { @@ -249,6 +264,46 @@ struct io_sq_data { unsigned sq_thread_idle; }; +#define IO_IOPOLL_BATCH 8 +#define IO_COMPL_BATCH 32 +#define IO_REQ_CACHE_SIZE 32 +#define IO_REQ_ALLOC_BATCH 8 + +struct io_comp_state { + struct io_kiocb *reqs[IO_COMPL_BATCH]; + unsigned int nr; + unsigned int locked_free_nr; + /* inline/task_work completion list, under ->uring_lock */ + struct list_head free_list; + /* IRQ completion list, under ->completion_lock */ + struct list_head locked_free_list; +}; + +struct io_submit_state { + struct blk_plug plug; + + /* + * io_kiocb alloc cache + */ + void *reqs[IO_REQ_CACHE_SIZE]; + unsigned int free_reqs; + + bool plug_started; + + /* + * Batch completion logic + */ + struct io_comp_state comp; + + /* + * File reference cache + */ + struct file *file; + unsigned int fd; + unsigned int file_refs; + unsigned int ios_left; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -291,6 +346,13 @@ struct io_ring_ctx { struct io_uring_sqe *sq_sqes; } ____cacheline_aligned_in_smp; + struct { + struct mutex uring_lock; + wait_queue_head_t wait; + } ____cacheline_aligned_in_smp; + + struct io_submit_state submit_state; + struct io_rings *rings; /* IO offload */ @@ -319,7 +381,7 @@ struct io_ring_ctx { * readers must ensure that ->refs is alive as long as the file* is * used. Only updated through io_uring_register(2). */ - struct fixed_file_data *file_data; + struct fixed_rsrc_data *file_data; unsigned nr_user_files; /* if used, fixed mapped user buffers */ @@ -338,9 +400,6 @@ struct io_ring_ctx { struct completion ref_comp; struct completion sq_thread_comp; - /* if all else fails... */ - struct io_kiocb *fallback_req; - #if defined(CONFIG_UNIX) struct socket *ring_sock; #endif @@ -362,11 +421,6 @@ struct io_ring_ctx { } ____cacheline_aligned_in_smp; struct { - struct mutex uring_lock; - wait_queue_head_t wait; - } ____cacheline_aligned_in_smp; - - struct { spinlock_t completion_lock; /* @@ -384,11 +438,15 @@ struct io_ring_ctx { struct list_head inflight_list; } ____cacheline_aligned_in_smp; - struct delayed_work file_put_work; - struct llist_head file_put_llist; + struct delayed_work rsrc_put_work; + struct llist_head rsrc_put_llist; + struct list_head rsrc_ref_list; + spinlock_t rsrc_ref_lock; - struct work_struct exit_work; struct io_restriction restrictions; + + /* Keep this last, we don't need it for the fast path */ + struct work_struct exit_work; }; /* @@ -411,7 +469,6 @@ struct io_poll_remove { struct io_close { struct file *file; - struct file *put_file; int fd; }; @@ -489,13 +546,12 @@ struct io_sr_msg { struct io_open { struct file *file; int dfd; - bool ignore_nonblock; struct filename *filename; struct open_how how; unsigned long nofile; }; -struct io_files_update { +struct io_rsrc_update { struct file *file; u64 arg; u32 nr_args; @@ -584,7 +640,8 @@ struct io_async_connect { struct io_async_msghdr { struct iovec fast_iov[UIO_FASTIOV]; - struct iovec *iov; + /* points to an allocated iov, if NULL we use fast_iov instead */ + struct iovec *free_iov; struct sockaddr __user *uaddr; struct msghdr msg; struct sockaddr_storage addr; @@ -618,6 +675,7 @@ enum { REQ_F_NO_FILE_TABLE_BIT, REQ_F_WORK_INITIALIZED_BIT, REQ_F_LTIMEOUT_ACTIVE_BIT, + REQ_F_COMPLETE_INLINE_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -661,6 +719,8 @@ enum { REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), /* linked timeout is active, i.e. prepared by link's head */ REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT), + /* completion is deferred through io_comp_state */ + REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), }; struct async_poll { @@ -668,6 +728,11 @@ struct async_poll { struct io_poll_iocb *double_poll; }; +struct io_task_work { + struct io_wq_work_node node; + task_work_func_t func; +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -689,7 +754,7 @@ struct io_kiocb { struct io_sr_msg sr_msg; struct io_open open; struct io_close close; - struct io_files_update files_update; + struct io_rsrc_update rsrc_update; struct io_fadvise fadvise; struct io_madvise madvise; struct io_epoll epoll; @@ -719,14 +784,17 @@ struct io_kiocb { u64 user_data; struct io_kiocb *link; - struct percpu_ref *fixed_file_refs; + struct percpu_ref *fixed_rsrc_refs; /* * 1. used with ctx->iopoll_list with reads/writes * 2. to track reqs with ->files (see io_op_def::file_table) */ struct list_head inflight_entry; - struct callback_head task_work; + union { + struct io_task_work io_task_work; + struct callback_head task_work; + }; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; struct async_poll *apoll; @@ -739,44 +807,9 @@ struct io_defer_entry { u32 seq; }; -#define IO_IOPOLL_BATCH 8 - -struct io_comp_state { - unsigned int nr; - struct list_head list; - struct io_ring_ctx *ctx; -}; - -struct io_submit_state { - struct blk_plug plug; - - /* - * io_kiocb alloc cache - */ - void *reqs[IO_IOPOLL_BATCH]; - unsigned int free_reqs; - - bool plug_started; - - /* - * Batch completion logic - */ - struct io_comp_state comp; - - /* - * File reference cache - */ - struct file *file; - unsigned int fd; - unsigned int file_refs; - unsigned int ios_left; -}; - struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; - /* don't fail if file grab fails */ - unsigned needs_file_no_error : 1; /* hash wq insertion if file is a regular file */ unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ @@ -908,8 +941,6 @@ static const struct io_op_def io_op_defs[] = { IO_WQ_WORK_FS | IO_WQ_WORK_MM, }, [IORING_OP_CLOSE] = { - .needs_file = 1, - .needs_file_no_error = 1, .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG, }, [IORING_OP_FILES_UPDATE] = { @@ -991,44 +1022,43 @@ static const struct io_op_def io_op_defs[] = { }, }; -enum io_mem_account { - ACCT_LOCKED, - ACCT_PINNED, -}; - -static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, - struct task_struct *task); - -static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node); -static struct fixed_file_ref_node *alloc_fixed_file_ref_node( +static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, + struct task_struct *task, + struct files_struct *files); +static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node); +static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node( struct io_ring_ctx *ctx); +static void init_fixed_file_ref_node(struct io_ring_ctx *ctx, + struct fixed_rsrc_ref_node *ref_node); -static void __io_complete_rw(struct io_kiocb *req, long res, long res2, - struct io_comp_state *cs); +static bool io_rw_reissue(struct io_kiocb *req); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req, int nr); static void io_double_put_req(struct io_kiocb *req); +static void io_dismantle_req(struct io_kiocb *req); +static void io_put_task(struct task_struct *task, int nr); +static void io_queue_next(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void __io_queue_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, - struct io_uring_files_update *ip, + struct io_uring_rsrc_update *ip, unsigned nr_args); static void __io_clean_op(struct io_kiocb *req); static struct file *io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, bool fixed); -static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs); -static void io_file_put_work(struct work_struct *work); +static void __io_queue_sqe(struct io_kiocb *req); +static void io_rsrc_put_work(struct work_struct *work); -static ssize_t io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct iov_iter *iter, - bool needs_lock); +static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, + struct iov_iter *iter, bool needs_lock); static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, struct iov_iter *iter, bool force); -static void io_req_drop_files(struct io_kiocb *req); static void io_req_task_queue(struct io_kiocb *req); +static void io_submit_flush_completions(struct io_comp_state *cs, + struct io_ring_ctx *ctx); static struct kmem_cache *req_cachep; @@ -1060,9 +1090,9 @@ static inline void io_set_resource_node(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - if (!req->fixed_file_refs) { - req->fixed_file_refs = &ctx->file_data->node->refs; - percpu_ref_get(req->fixed_file_refs); + if (!req->fixed_rsrc_refs) { + req->fixed_rsrc_refs = &ctx->file_data->node->refs; + percpu_ref_get(req->fixed_rsrc_refs); } } @@ -1117,9 +1147,6 @@ static void io_sq_thread_drop_mm_files(void) static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) { - if (current->flags & PF_EXITING) - return -EFAULT; - if (!current->files) { struct files_struct *files; struct nsproxy *nsproxy; @@ -1147,15 +1174,9 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { struct mm_struct *mm; - if (current->flags & PF_EXITING) - return -EFAULT; if (current->mm) return 0; - /* Should never happen */ - if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL))) - return -EFAULT; - task_lock(ctx->sqo_task); mm = ctx->sqo_task->mm; if (unlikely(!mm || !mmget_not_zero(mm))) @@ -1170,8 +1191,8 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) return -EFAULT; } -static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static int __io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, + struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; int ret; @@ -1191,6 +1212,14 @@ static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, return 0; } +static inline int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (!(ctx->flags & IORING_SETUP_SQPOLL)) + return 0; + return __io_sq_thread_acquire_mm_files(ctx, req); +} + static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx, struct cgroup_subsys_state **cur_css) @@ -1267,11 +1296,6 @@ static inline void io_req_init_async(struct io_kiocb *req) refcount_inc(&req->work.identity->count); } -static inline bool io_async_submit(struct io_ring_ctx *ctx) -{ - return ctx->flags & IORING_SETUP_SQPOLL; -} - static void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); @@ -1293,10 +1317,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) if (!ctx) return NULL; - ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL); - if (!ctx->fallback_req) - goto err; - /* * Use 5 bits less than the max cq entries, that should give us around * 32 entries per hash list if totally full and uniformly spread. @@ -1333,12 +1353,14 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->timeout_list); spin_lock_init(&ctx->inflight_lock); INIT_LIST_HEAD(&ctx->inflight_list); - INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work); - init_llist_head(&ctx->file_put_llist); + spin_lock_init(&ctx->rsrc_ref_lock); + INIT_LIST_HEAD(&ctx->rsrc_ref_list); + INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); + init_llist_head(&ctx->rsrc_put_llist); + INIT_LIST_HEAD(&ctx->submit_state.comp.free_list); + INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list); return ctx; err: - if (ctx->fallback_req) - kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx->cancel_hash); kfree(ctx); return NULL; @@ -1356,14 +1378,6 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } -static void __io_commit_cqring(struct io_ring_ctx *ctx) -{ - struct io_rings *rings = ctx->rings; - - /* order cqe stores with ring update */ - smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); -} - static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req) { if (req->work.identity == &tctx->__identity) @@ -1377,22 +1391,14 @@ static void io_req_clean_work(struct io_kiocb *req) if (!(req->flags & REQ_F_WORK_INITIALIZED)) return; - req->flags &= ~REQ_F_WORK_INITIALIZED; - - if (req->work.flags & IO_WQ_WORK_MM) { + if (req->work.flags & IO_WQ_WORK_MM) mmdrop(req->work.identity->mm); - req->work.flags &= ~IO_WQ_WORK_MM; - } #ifdef CONFIG_BLK_CGROUP - if (req->work.flags & IO_WQ_WORK_BLKCG) { + if (req->work.flags & IO_WQ_WORK_BLKCG) css_put(req->work.identity->blkcg_css); - req->work.flags &= ~IO_WQ_WORK_BLKCG; - } #endif - if (req->work.flags & IO_WQ_WORK_CREDS) { + if (req->work.flags & IO_WQ_WORK_CREDS) put_cred(req->work.identity->creds); - req->work.flags &= ~IO_WQ_WORK_CREDS; - } if (req->work.flags & IO_WQ_WORK_FS) { struct fs_struct *fs = req->work.identity->fs; @@ -1402,11 +1408,27 @@ static void io_req_clean_work(struct io_kiocb *req) spin_unlock(&req->work.identity->fs->lock); if (fs) free_fs_struct(fs); - req->work.flags &= ~IO_WQ_WORK_FS; } - if (req->flags & REQ_F_INFLIGHT) - io_req_drop_files(req); + if (req->work.flags & IO_WQ_WORK_FILES) { + put_files_struct(req->work.identity->files); + put_nsproxy(req->work.identity->nsproxy); + } + if (req->flags & REQ_F_INFLIGHT) { + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_task *tctx = req->task->io_uring; + unsigned long flags; + spin_lock_irqsave(&ctx->inflight_lock, flags); + list_del(&req->inflight_entry); + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + req->flags &= ~REQ_F_INFLIGHT; + if (atomic_read(&tctx->in_idle)) + wake_up(&tctx->wait); + } + + req->flags &= ~REQ_F_WORK_INITIALIZED; + req->work.flags &= ~(IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FS | + IO_WQ_WORK_CREDS | IO_WQ_WORK_FILES); io_put_identity(req->task->io_uring, req); } @@ -1455,11 +1477,24 @@ static bool io_identity_cow(struct io_kiocb *req) return true; } +static void io_req_track_inflight(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (!(req->flags & REQ_F_INFLIGHT)) { + io_req_init_async(req); + req->flags |= REQ_F_INFLIGHT; + + spin_lock_irq(&ctx->inflight_lock); + list_add(&req->inflight_entry, &ctx->inflight_list); + spin_unlock_irq(&ctx->inflight_lock); + } +} + static bool io_grab_identity(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; struct io_identity *id = req->work.identity; - struct io_ring_ctx *ctx = req->ctx; if (def->work_flags & IO_WQ_WORK_FSIZE) { if (id->fsize != rlimit(RLIMIT_FSIZE)) @@ -1515,15 +1550,8 @@ static bool io_grab_identity(struct io_kiocb *req) return false; atomic_inc(&id->files->count); get_nsproxy(id->nsproxy); - - if (!(req->flags & REQ_F_INFLIGHT)) { - req->flags |= REQ_F_INFLIGHT; - - spin_lock_irq(&ctx->inflight_lock); - list_add(&req->inflight_entry, &ctx->inflight_list); - spin_unlock_irq(&ctx->inflight_lock); - } req->work.flags |= IO_WQ_WORK_FILES; + io_req_track_inflight(req); } if (!(req->work.flags & IO_WQ_WORK_MM) && (def->work_flags & IO_WQ_WORK_MM)) { @@ -1685,7 +1713,9 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) static void io_commit_cqring(struct io_ring_ctx *ctx) { io_flush_timeouts(ctx); - __io_commit_cqring(ctx); + + /* order cqe stores with ring update */ + smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); if (unlikely(!list_empty(&ctx->defer_list))) __io_queue_deferred(ctx); @@ -1698,21 +1728,25 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx) return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries; } +static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) +{ + return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); +} + static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; unsigned tail; - tail = ctx->cached_cq_tail; /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ - if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries) + if (__io_cqring_events(ctx) == rings->cq_ring_entries) return NULL; - ctx->cached_cq_tail++; + tail = ctx->cached_cq_tail++; return &rings->cqes[tail & ctx->cq_mask]; } @@ -1727,11 +1761,6 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) return io_wq_current_is_worker(); } -static inline unsigned __io_cqring_events(struct io_ring_ctx *ctx) -{ - return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); -} - static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { /* see waitqueue_active() comment */ @@ -1885,7 +1914,8 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) __io_cqring_fill_event(req, res, 0); } -static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags) +static inline void io_req_complete_post(struct io_kiocb *req, long res, + unsigned int cflags) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; @@ -1893,92 +1923,96 @@ static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags) spin_lock_irqsave(&ctx->completion_lock, flags); __io_cqring_fill_event(req, res, cflags); io_commit_cqring(ctx); + /* + * If we're the last reference to this request, add to our locked + * free_list cache. + */ + if (refcount_dec_and_test(&req->refs)) { + struct io_comp_state *cs = &ctx->submit_state.comp; + + io_dismantle_req(req); + io_put_task(req->task, 1); + list_add(&req->compl.list, &cs->locked_free_list); + cs->locked_free_nr++; + } else + req = NULL; spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); -} - -static void io_submit_flush_completions(struct io_comp_state *cs) -{ - struct io_ring_ctx *ctx = cs->ctx; - - spin_lock_irq(&ctx->completion_lock); - while (!list_empty(&cs->list)) { - struct io_kiocb *req; - - req = list_first_entry(&cs->list, struct io_kiocb, compl.list); - list_del(&req->compl.list); - __io_cqring_fill_event(req, req->result, req->compl.cflags); - - /* - * io_free_req() doesn't care about completion_lock unless one - * of these flags is set. REQ_F_WORK_INITIALIZED is in the list - * because of a potential deadlock with req->work.fs->lock - */ - if (req->flags & (REQ_F_FAIL_LINK|REQ_F_LINK_TIMEOUT - |REQ_F_WORK_INITIALIZED)) { - spin_unlock_irq(&ctx->completion_lock); - io_put_req(req); - spin_lock_irq(&ctx->completion_lock); - } else { - io_put_req(req); - } + if (req) { + io_queue_next(req); + percpu_ref_put(&ctx->refs); } - io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); - cs->nr = 0; } -static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags, - struct io_comp_state *cs) +static void io_req_complete_state(struct io_kiocb *req, long res, + unsigned int cflags) { - if (!cs) { - io_cqring_add_event(req, res, cflags); - io_put_req(req); - } else { - io_clean_op(req); - req->result = res; - req->compl.cflags = cflags; - list_add_tail(&req->compl.list, &cs->list); - if (++cs->nr >= 32) - io_submit_flush_completions(cs); - } + io_clean_op(req); + req->result = res; + req->compl.cflags = cflags; + req->flags |= REQ_F_COMPLETE_INLINE; } -static void io_req_complete(struct io_kiocb *req, long res) +static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, + long res, unsigned cflags) { - __io_req_complete(req, res, 0, NULL); + if (issue_flags & IO_URING_F_COMPLETE_DEFER) + io_req_complete_state(req, res, cflags); + else + io_req_complete_post(req, res, cflags); } -static inline bool io_is_fallback_req(struct io_kiocb *req) +static inline void io_req_complete(struct io_kiocb *req, long res) { - return req == (struct io_kiocb *) - ((unsigned long) req->ctx->fallback_req & ~1UL); + __io_req_complete(req, 0, res, 0); } -static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx) +static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) { - struct io_kiocb *req; + struct io_submit_state *state = &ctx->submit_state; + struct io_comp_state *cs = &state->comp; + struct io_kiocb *req = NULL; - req = ctx->fallback_req; - if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req)) - return req; + /* + * If we have more than a batch's worth of requests in our IRQ side + * locked cache, grab the lock and move them over to our submission + * side cache. + */ + if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) { + spin_lock_irq(&ctx->completion_lock); + list_splice_init(&cs->locked_free_list, &cs->free_list); + cs->locked_free_nr = 0; + spin_unlock_irq(&ctx->completion_lock); + } - return NULL; + while (!list_empty(&cs->free_list)) { + req = list_first_entry(&cs->free_list, struct io_kiocb, + compl.list); + list_del(&req->compl.list); + state->reqs[state->free_reqs++] = req; + if (state->free_reqs == ARRAY_SIZE(state->reqs)) + break; + } + + return req != NULL; } -static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, - struct io_submit_state *state) +static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) { + struct io_submit_state *state = &ctx->submit_state; + + BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs)); + if (!state->free_reqs) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; - size_t sz; int ret; - sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs)); - ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs); + if (io_flush_cached_reqs(ctx)) + goto got_req; + + ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, + state->reqs); /* * Bulk alloc is all-or-nothing. If we fail to get a batch, @@ -1987,16 +2021,14 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, if (unlikely(ret <= 0)) { state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); if (!state->reqs[0]) - goto fallback; + return NULL; ret = 1; } state->free_reqs = ret; } - +got_req: state->free_reqs--; return state->reqs[state->free_reqs]; -fallback: - return io_get_fallback_req(ctx); } static inline void io_put_file(struct io_kiocb *req, struct file *file, @@ -2014,27 +2046,29 @@ static void io_dismantle_req(struct io_kiocb *req) kfree(req->async_data); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - if (req->fixed_file_refs) - percpu_ref_put(req->fixed_file_refs); + if (req->fixed_rsrc_refs) + percpu_ref_put(req->fixed_rsrc_refs); io_req_clean_work(req); } +static inline void io_put_task(struct task_struct *task, int nr) +{ + struct io_uring_task *tctx = task->io_uring; + + percpu_counter_sub(&tctx->inflight, nr); + if (unlikely(atomic_read(&tctx->in_idle))) + wake_up(&tctx->wait); + put_task_struct_many(task, nr); +} + static void __io_free_req(struct io_kiocb *req) { - struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; io_dismantle_req(req); + io_put_task(req->task, 1); - percpu_counter_dec(&tctx->inflight); - if (atomic_read(&tctx->in_idle)) - wake_up(&tctx->wait); - put_task_struct(req->task); - - if (likely(!io_is_fallback_req(req))) - kmem_cache_free(req_cachep, req); - else - clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req); + kmem_cache_free(req_cachep, req); percpu_ref_put(&ctx->refs); } @@ -2140,11 +2174,105 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) { - if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT))) + if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) return NULL; return __io_req_find_next(req); } +static bool __tctx_task_work(struct io_uring_task *tctx) +{ + struct io_ring_ctx *ctx = NULL; + struct io_wq_work_list list; + struct io_wq_work_node *node; + + if (wq_list_empty(&tctx->task_list)) + return false; + + spin_lock_irq(&tctx->task_lock); + list = tctx->task_list; + INIT_WQ_LIST(&tctx->task_list); + spin_unlock_irq(&tctx->task_lock); + + node = list.first; + while (node) { + struct io_wq_work_node *next = node->next; + struct io_ring_ctx *this_ctx; + struct io_kiocb *req; + + req = container_of(node, struct io_kiocb, io_task_work.node); + this_ctx = req->ctx; + req->task_work.func(&req->task_work); + node = next; + + if (!ctx) { + ctx = this_ctx; + } else if (ctx != this_ctx) { + mutex_lock(&ctx->uring_lock); + io_submit_flush_completions(&ctx->submit_state.comp, ctx); + mutex_unlock(&ctx->uring_lock); + ctx = this_ctx; + } + } + + if (ctx && ctx->submit_state.comp.nr) { + mutex_lock(&ctx->uring_lock); + io_submit_flush_completions(&ctx->submit_state.comp, ctx); + mutex_unlock(&ctx->uring_lock); + } + + return list.first != NULL; +} + +static void tctx_task_work(struct callback_head *cb) +{ + struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); + + while (__tctx_task_work(tctx)) + cond_resched(); + + clear_bit(0, &tctx->task_state); +} + +static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req, + enum task_work_notify_mode notify) +{ + struct io_uring_task *tctx = tsk->io_uring; + struct io_wq_work_node *node, *prev; + unsigned long flags; + int ret; + + WARN_ON_ONCE(!tctx); + + spin_lock_irqsave(&tctx->task_lock, flags); + wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); + spin_unlock_irqrestore(&tctx->task_lock, flags); + + /* task_work already pending, we're done */ + if (test_bit(0, &tctx->task_state) || + test_and_set_bit(0, &tctx->task_state)) + return 0; + + if (!task_work_add(tsk, &tctx->task_work, notify)) + return 0; + + /* + * Slow path - we failed, find and delete work. if the work is not + * in the list, it got run and we're fine. + */ + ret = 0; + spin_lock_irqsave(&tctx->task_lock, flags); + wq_list_for_each(node, prev, &tctx->task_list) { + if (&req->io_task_work.node == node) { + wq_list_del(&tctx->task_list, node, prev); + ret = 1; + break; + } + } + spin_unlock_irqrestore(&tctx->task_lock, flags); + clear_bit(0, &tctx->task_state); + return ret; +} + static int io_req_task_work_add(struct io_kiocb *req) { struct task_struct *tsk = req->task; @@ -2165,13 +2293,23 @@ static int io_req_task_work_add(struct io_kiocb *req) if (!(ctx->flags & IORING_SETUP_SQPOLL)) notify = TWA_SIGNAL; - ret = task_work_add(tsk, &req->task_work, notify); + ret = io_task_work_add(tsk, req, notify); if (!ret) wake_up_process(tsk); return ret; } +static void io_req_task_work_add_fallback(struct io_kiocb *req, + task_work_func_t cb) +{ + struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq); + + init_task_work(&req->task_work, cb); + task_work_add(tsk, &req->task_work, TWA_NONE); + wake_up_process(tsk); +} + static void __io_req_task_cancel(struct io_kiocb *req, int error) { struct io_ring_ctx *ctx = req->ctx; @@ -2199,11 +2337,11 @@ static void __io_req_task_submit(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ mutex_lock(&ctx->uring_lock); - if (!ctx->sqo_dead && - !__io_sq_thread_acquire_mm(ctx) && - !__io_sq_thread_acquire_files(ctx)) - __io_queue_sqe(req, NULL); + if (!ctx->sqo_dead && !(current->flags & PF_EXITING) && + !io_sq_thread_acquire_mm_files(ctx, req)) + __io_queue_sqe(req); else __io_req_task_cancel(req, -EFAULT); mutex_unlock(&ctx->uring_lock); @@ -2215,27 +2353,19 @@ static void __io_req_task_submit(struct io_kiocb *req) static void io_req_task_submit(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); - struct io_ring_ctx *ctx = req->ctx; __io_req_task_submit(req); - percpu_ref_put(&ctx->refs); } static void io_req_task_queue(struct io_kiocb *req) { int ret; - init_task_work(&req->task_work, io_req_task_submit); - percpu_ref_get(&req->ctx->refs); - + req->task_work.func = io_req_task_submit; ret = io_req_task_work_add(req); if (unlikely(ret)) { - struct task_struct *tsk; - - init_task_work(&req->task_work, io_req_task_cancel); - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); + percpu_ref_get(&req->ctx->refs); + io_req_task_work_add_fallback(req, io_req_task_cancel); } } @@ -2254,70 +2384,75 @@ static void io_free_req(struct io_kiocb *req) } struct req_batch { - void *reqs[IO_IOPOLL_BATCH]; - int to_free; - struct task_struct *task; int task_refs; + int ctx_refs; }; static inline void io_init_req_batch(struct req_batch *rb) { - rb->to_free = 0; rb->task_refs = 0; + rb->ctx_refs = 0; rb->task = NULL; } -static void __io_req_free_batch_flush(struct io_ring_ctx *ctx, - struct req_batch *rb) -{ - kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); - percpu_ref_put_many(&ctx->refs, rb->to_free); - rb->to_free = 0; -} - static void io_req_free_batch_finish(struct io_ring_ctx *ctx, struct req_batch *rb) { - if (rb->to_free) - __io_req_free_batch_flush(ctx, rb); - if (rb->task) { - struct io_uring_task *tctx = rb->task->io_uring; - - percpu_counter_sub(&tctx->inflight, rb->task_refs); - if (atomic_read(&tctx->in_idle)) - wake_up(&tctx->wait); - put_task_struct_many(rb->task, rb->task_refs); - rb->task = NULL; - } + if (rb->task) + io_put_task(rb->task, rb->task_refs); + if (rb->ctx_refs) + percpu_ref_put_many(&ctx->refs, rb->ctx_refs); } -static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) +static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, + struct io_submit_state *state) { - if (unlikely(io_is_fallback_req(req))) { - io_free_req(req); - return; - } io_queue_next(req); if (req->task != rb->task) { - if (rb->task) { - struct io_uring_task *tctx = rb->task->io_uring; - - percpu_counter_sub(&tctx->inflight, rb->task_refs); - if (atomic_read(&tctx->in_idle)) - wake_up(&tctx->wait); - put_task_struct_many(rb->task, rb->task_refs); - } + if (rb->task) + io_put_task(rb->task, rb->task_refs); rb->task = req->task; rb->task_refs = 0; } rb->task_refs++; + rb->ctx_refs++; io_dismantle_req(req); - rb->reqs[rb->to_free++] = req; - if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) - __io_req_free_batch_flush(req->ctx, rb); + if (state->free_reqs != ARRAY_SIZE(state->reqs)) + state->reqs[state->free_reqs++] = req; + else + list_add(&req->compl.list, &state->comp.free_list); +} + +static void io_submit_flush_completions(struct io_comp_state *cs, + struct io_ring_ctx *ctx) +{ + int i, nr = cs->nr; + struct io_kiocb *req; + struct req_batch rb; + + io_init_req_batch(&rb); + spin_lock_irq(&ctx->completion_lock); + for (i = 0; i < nr; i++) { + req = cs->reqs[i]; + __io_cqring_fill_event(req, req->result, req->compl.cflags); + } + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + for (i = 0; i < nr; i++) { + req = cs->reqs[i]; + + /* submission and completion refs */ + if (refcount_sub_and_test(2, &req->refs)) + io_req_free_batch(&rb, req, &ctx->submit_state); + } + + io_req_free_batch_finish(ctx, &rb); + cs->nr = 0; } /* @@ -2352,15 +2487,10 @@ static void io_free_req_deferred(struct io_kiocb *req) { int ret; - init_task_work(&req->task_work, io_put_req_deferred_cb); + req->task_work.func = io_put_req_deferred_cb; ret = io_req_task_work_add(req); - if (unlikely(ret)) { - struct task_struct *tsk; - - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); - } + if (unlikely(ret)) + io_req_task_work_add_fallback(req, io_put_req_deferred_cb); } static inline void io_put_req_deferred(struct io_kiocb *req, int refs) @@ -2369,22 +2499,6 @@ static inline void io_put_req_deferred(struct io_kiocb *req, int refs) io_free_req_deferred(req); } -static struct io_wq_work *io_steal_work(struct io_kiocb *req) -{ - struct io_kiocb *nxt; - - /* - * A ref is owned by io-wq in which context we're. So, if that's the - * last one, it's safe to steal next work. False negatives are Ok, - * it just will be re-punted async in io_put_work() - */ - if (refcount_read(&req->refs) != 1) - return NULL; - - nxt = io_req_find_next(req); - return nxt ? &nxt->work : NULL; -} - static void io_double_put_req(struct io_kiocb *req) { /* drop both submit and complete references */ @@ -2443,17 +2557,6 @@ static inline bool io_run_task_work(void) return false; } -static void io_iopoll_queue(struct list_head *again) -{ - struct io_kiocb *req; - - do { - req = list_first_entry(again, struct io_kiocb, inflight_entry); - list_del(&req->inflight_entry); - __io_complete_rw(req, -EAGAIN, 0, NULL); - } while (!list_empty(again)); -} - /* * Find and free completed poll iocbs */ @@ -2462,7 +2565,6 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, { struct req_batch rb; struct io_kiocb *req; - LIST_HEAD(again); /* order with ->result store in io_complete_rw_iopoll() */ smp_rmb(); @@ -2472,13 +2574,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, int cflags = 0; req = list_first_entry(done, struct io_kiocb, inflight_entry); + list_del(&req->inflight_entry); + if (READ_ONCE(req->result) == -EAGAIN) { - req->result = 0; req->iopoll_completed = 0; - list_move_tail(&req->inflight_entry, &again); - continue; + if (io_rw_reissue(req)) + continue; } - list_del(&req->inflight_entry); if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_rw_kbuf(req); @@ -2487,15 +2589,12 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, (*nr_events)++; if (refcount_dec_and_test(&req->refs)) - io_req_free_batch(&rb, req); + io_req_free_batch(&rb, req, &ctx->submit_state); } io_commit_cqring(ctx); io_cqring_ev_posted_iopoll(ctx); io_req_free_batch_finish(ctx, &rb); - - if (!list_empty(&again)) - io_iopoll_queue(&again); } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, @@ -2662,34 +2761,16 @@ static void kiocb_end_write(struct io_kiocb *req) file_end_write(req->file); } -static void io_complete_rw_common(struct kiocb *kiocb, long res, - struct io_comp_state *cs) -{ - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - int cflags = 0; - - if (kiocb->ki_flags & IOCB_WRITE) - kiocb_end_write(req); - - if (res != req->result) - req_set_fail_links(req); - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_rw_kbuf(req); - __io_req_complete(req, res, cflags, cs); -} - #ifdef CONFIG_BLOCK -static bool io_resubmit_prep(struct io_kiocb *req, int error) +static bool io_resubmit_prep(struct io_kiocb *req) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; - ssize_t ret = -ECANCELED; + int rw, ret; struct iov_iter iter; - int rw; - if (error) { - ret = error; - goto end_req; - } + /* already prepared */ + if (req->async_data) + return true; switch (req->opcode) { case IORING_OP_READV: @@ -2705,27 +2786,17 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) default: printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n", req->opcode); - goto end_req; + return false; } - if (!req->async_data) { - ret = io_import_iovec(rw, req, &iovec, &iter, false); - if (ret < 0) - goto end_req; - ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false); - if (!ret) - return true; - kfree(iovec); - } else { - return true; - } -end_req: - req_set_fail_links(req); - return false; + ret = io_import_iovec(rw, req, &iovec, &iter, false); + if (ret < 0) + return false; + return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false); } #endif -static bool io_rw_reissue(struct io_kiocb *req, long res) +static bool io_rw_reissue(struct io_kiocb *req) { #ifdef CONFIG_BLOCK umode_t mode = file_inode(req->file)->i_mode; @@ -2733,35 +2804,45 @@ static bool io_rw_reissue(struct io_kiocb *req, long res) if (!S_ISBLK(mode) && !S_ISREG(mode)) return false; - if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) + if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker()) return false; lockdep_assert_held(&req->ctx->uring_lock); ret = io_sq_thread_acquire_mm_files(req->ctx, req); - if (io_resubmit_prep(req, ret)) { + if (!ret && io_resubmit_prep(req)) { refcount_inc(&req->refs); io_queue_async_work(req); return true; } - + req_set_fail_links(req); #endif return false; } static void __io_complete_rw(struct io_kiocb *req, long res, long res2, - struct io_comp_state *cs) + unsigned int issue_flags) { - if (!io_rw_reissue(req, res)) - io_complete_rw_common(&req->rw.kiocb, res, cs); + int cflags = 0; + + if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req)) + return; + if (res != req->result) + req_set_fail_links(req); + + if (req->rw.kiocb.ki_flags & IOCB_WRITE) + kiocb_end_write(req); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_rw_kbuf(req); + __io_req_complete(req, issue_flags, res, cflags); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - __io_complete_rw(req, res, res2, NULL); + __io_complete_rw(req, res, res2, 0); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) @@ -2825,16 +2906,12 @@ static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) wake_up(&ctx->sq_data->wait); } -static inline void __io_state_file_put(struct io_submit_state *state) -{ - fput_many(state->file, state->file_refs); - state->file_refs = 0; -} - static inline void io_state_file_put(struct io_submit_state *state) { - if (state->file_refs) - __io_state_file_put(state); + if (state->file_refs) { + fput_many(state->file, state->file_refs); + state->file_refs = 0; + } } /* @@ -2852,7 +2929,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) state->file_refs--; return state->file; } - __io_state_file_put(state); + io_state_file_put(state); } state->file = fget_many(fd, state->ios_left); if (unlikely(!state->file)) @@ -2910,16 +2987,17 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw.kiocb; + struct file *file = req->file; unsigned ioprio; int ret; - if (S_ISREG(file_inode(req->file)->i_mode)) + if (S_ISREG(file_inode(file)->i_mode)) req->flags |= REQ_F_ISREG; kiocb->ki_pos = READ_ONCE(sqe->off); - if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) { + if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) { req->flags |= REQ_F_CUR_POS; - kiocb->ki_pos = req->file->f_pos; + kiocb->ki_pos = file->f_pos; } kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); @@ -2927,6 +3005,10 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(ret)) return ret; + /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */ + if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK)) + req->flags |= REQ_F_NOWAIT; + ioprio = READ_ONCE(sqe->ioprio); if (ioprio) { ret = ioprio_check_cap(ioprio); @@ -2937,10 +3019,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) } else kiocb->ki_ioprio = get_current_ioprio(); - /* don't allow async punt if RWF_NOWAIT was requested */ - if (kiocb->ki_flags & IOCB_NOWAIT) - req->flags |= REQ_F_NOWAIT; - if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !kiocb->ki_filp->f_op->iopoll) @@ -2983,7 +3061,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } static void kiocb_done(struct kiocb *kiocb, ssize_t ret, - struct io_comp_state *cs) + unsigned int issue_flags) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); struct io_async_rw *io = req->async_data; @@ -2999,13 +3077,12 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; if (ret >= 0 && kiocb->ki_complete == io_complete_rw) - __io_complete_rw(req, ret, 0, cs); + __io_complete_rw(req, ret, 0, issue_flags); else io_rw_done(kiocb, ret); } -static ssize_t io_import_fixed(struct io_kiocb *req, int rw, - struct iov_iter *iter) +static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) { struct io_ring_ctx *ctx = req->ctx; size_t len = req->rw.len; @@ -3069,7 +3146,7 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw, } } - return len; + return 0; } static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) @@ -3210,16 +3287,14 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, return __io_iov_buffer_select(req, iov, needs_lock); } -static ssize_t io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct iov_iter *iter, - bool needs_lock) +static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, + struct iov_iter *iter, bool needs_lock) { void __user *buf = u64_to_user_ptr(req->rw.addr); size_t sqe_len = req->rw.len; + u8 opcode = req->opcode; ssize_t ret; - u8 opcode; - opcode = req->opcode; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; return io_import_fixed(req, rw, iter); @@ -3244,10 +3319,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, if (req->flags & REQ_F_BUFFER_SELECT) { ret = io_iov_buffer_select(req, *iovec, needs_lock); - if (!ret) { - ret = (*iovec)->iov_len; - iov_iter_init(iter, rw, *iovec, 1, ret); - } + if (!ret) + iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len); *iovec = NULL; return ret; } @@ -3365,8 +3438,10 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, if (!force && !io_op_defs[req->opcode].needs_async_data) return 0; if (!req->async_data) { - if (__io_alloc_async_data(req)) + if (__io_alloc_async_data(req)) { + kfree(iovec); return -ENOMEM; + } io_req_map_rw(req, iovec, fast_iov, iter); } @@ -3377,7 +3452,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw) { struct io_async_rw *iorw = req->async_data; struct iovec *iov = iorw->fast_iov; - ssize_t ret; + int ret; ret = io_import_iovec(rw, req, &iov, &iorw->iter, false); if (unlikely(ret < 0)) @@ -3423,7 +3498,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, struct wait_page_queue *wpq; struct io_kiocb *req = wait->private; struct wait_page_key *key = arg; - int ret; wpq = container_of(wait, struct wait_page_queue, wait); @@ -3433,21 +3507,9 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; list_del_init(&wait->entry); - init_task_work(&req->task_work, io_req_task_submit); - percpu_ref_get(&req->ctx->refs); - /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); - ret = io_req_task_work_add(req); - if (unlikely(ret)) { - struct task_struct *tsk; - - /* queue just for cancelation */ - init_task_work(&req->task_work, io_req_task_cancel); - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); - } + io_req_task_queue(req); return 1; } @@ -3504,15 +3566,14 @@ static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) return -EINVAL; } -static int io_read(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_read(struct io_kiocb *req, unsigned int issue_flags) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter __iter, *iter = &__iter; struct io_async_rw *rw = req->async_data; ssize_t io_size, ret, ret2; - bool no_async; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; if (rw) { iter = &rw->iter; @@ -3524,7 +3585,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, } io_size = iov_iter_count(iter); req->result = io_size; - ret = 0; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) @@ -3532,88 +3592,72 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, else kiocb->ki_flags |= IOCB_NOWAIT; - /* If the file doesn't support async, just async punt */ - no_async = force_nonblock && !io_file_supports_async(req->file, READ); - if (no_async) - goto copy_iov; + if (force_nonblock && !io_file_supports_async(req->file, READ)) { + ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true); + return ret ?: -EAGAIN; + } ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size); - if (unlikely(ret)) - goto out_free; + if (unlikely(ret)) { + kfree(iovec); + return ret; + } ret = io_iter_do_read(req, iter); - if (!ret) { - goto done; - } else if (ret == -EIOCBQUEUED) { - ret = 0; - goto out_free; + if (ret == -EIOCBQUEUED) { + /* it's faster to check here then delegate to kfree */ + if (iovec) + kfree(iovec); + return 0; } else if (ret == -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) goto done; - /* no retry on NONBLOCK marked file */ - if (req->file->f_flags & O_NONBLOCK) + /* no retry on NONBLOCK nor RWF_NOWAIT */ + if (req->flags & REQ_F_NOWAIT) goto done; /* some cases will consume bytes even on error returns */ iov_iter_revert(iter, io_size - iov_iter_count(iter)); ret = 0; - goto copy_iov; - } else if (ret < 0) { - /* make sure -ERESTARTSYS -> -EINTR is done */ + } else if (ret <= 0 || ret == io_size || !force_nonblock || + (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) { + /* read all, failed, already did sync or don't want to retry */ goto done; } - /* read it all, or we did blocking attempt. no retry. */ - if (!iov_iter_count(iter) || !force_nonblock || - (req->file->f_flags & O_NONBLOCK) || !(req->flags & REQ_F_ISREG)) - goto done; - - io_size -= ret; -copy_iov: ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true); - if (ret2) { - ret = ret2; - goto out_free; - } - if (no_async) - return -EAGAIN; + if (ret2) + return ret2; + rw = req->async_data; - /* it's copied and will be cleaned with ->io */ - iovec = NULL; /* now use our persistent iterator, if we aren't already */ iter = &rw->iter; -retry: - rw->bytes_done += ret; - /* if we can retry, do so with the callbacks armed */ - if (!io_rw_should_retry(req)) { - kiocb->ki_flags &= ~IOCB_WAITQ; - return -EAGAIN; - } - /* - * Now retry read with the IOCB_WAITQ parts set in the iocb. If we - * get -EIOCBQUEUED, then we'll get a notification when the desired - * page gets unlocked. We can also get a partial read here, and if we - * do, then just retry at the new offset. - */ - ret = io_iter_do_read(req, iter); - if (ret == -EIOCBQUEUED) { - ret = 0; - goto out_free; - } else if (ret > 0 && ret < io_size) { + do { + io_size -= ret; + rw->bytes_done += ret; + /* if we can retry, do so with the callbacks armed */ + if (!io_rw_should_retry(req)) { + kiocb->ki_flags &= ~IOCB_WAITQ; + return -EAGAIN; + } + + /* + * Now retry read with the IOCB_WAITQ parts set in the iocb. If + * we get -EIOCBQUEUED, then we'll get a notification when the + * desired page gets unlocked. We can also get a partial read + * here, and if we do, then just retry at the new offset. + */ + ret = io_iter_do_read(req, iter); + if (ret == -EIOCBQUEUED) + return 0; /* we got some bytes, but not all. retry. */ - goto retry; - } + } while (ret > 0 && ret < io_size); done: - kiocb_done(kiocb, ret, cs); - ret = 0; -out_free: - /* it's reportedly faster than delegating the null check to kfree() */ - if (iovec) - kfree(iovec); - return ret; + kiocb_done(kiocb, ret, issue_flags); + return 0; } static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -3633,14 +3677,14 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_rw_prep_async(req, WRITE); } -static int io_write(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_write(struct io_kiocb *req, unsigned int issue_flags) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter __iter, *iter = &__iter; struct io_async_rw *rw = req->async_data; ssize_t ret, ret2, io_size; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; if (rw) { iter = &rw->iter; @@ -3699,22 +3743,21 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, */ if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) ret2 = -EAGAIN; - /* no retry on NONBLOCK marked file */ - if (ret2 == -EAGAIN && (req->file->f_flags & O_NONBLOCK)) + /* no retry on NONBLOCK nor RWF_NOWAIT */ + if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) goto done; if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN) goto copy_iov; done: - kiocb_done(kiocb, ret2, cs); + kiocb_done(kiocb, ret2, issue_flags); } else { copy_iov: /* some cases will consume bytes even on error returns */ iov_iter_revert(iter, io_size - iov_iter_count(iter)); ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); - if (!ret) - return -EAGAIN; + return ret ?: -EAGAIN; } out_free: /* it's reportedly faster than delegating the null check to kfree() */ @@ -3752,12 +3795,12 @@ static int io_renameat_prep(struct io_kiocb *req, return 0; } -static int io_renameat(struct io_kiocb *req, bool force_nonblock) +static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) { struct io_rename *ren = &req->rename; int ret; - if (force_nonblock) + if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, @@ -3794,12 +3837,12 @@ static int io_unlinkat_prep(struct io_kiocb *req, return 0; } -static int io_unlinkat(struct io_kiocb *req, bool force_nonblock) +static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) { struct io_unlink *un = &req->unlink; int ret; - if (force_nonblock) + if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; if (un->flags & AT_REMOVEDIR) @@ -3831,13 +3874,13 @@ static int io_shutdown_prep(struct io_kiocb *req, #endif } -static int io_shutdown(struct io_kiocb *req, bool force_nonblock) +static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) { #if defined(CONFIG_NET) struct socket *sock; int ret; - if (force_nonblock) + if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; sock = sock_from_file(req->file); @@ -3896,7 +3939,7 @@ static int io_tee_prep(struct io_kiocb *req, return __io_splice_prep(req, sqe); } -static int io_tee(struct io_kiocb *req, bool force_nonblock) +static int io_tee(struct io_kiocb *req, unsigned int issue_flags) { struct io_splice *sp = &req->splice; struct file *in = sp->file_in; @@ -3904,7 +3947,7 @@ static int io_tee(struct io_kiocb *req, bool force_nonblock) unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; long ret = 0; - if (force_nonblock) + if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; if (sp->len) ret = do_tee(in, out, sp->len, flags); @@ -3927,7 +3970,7 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_splice_prep(req, sqe); } -static int io_splice(struct io_kiocb *req, bool force_nonblock) +static int io_splice(struct io_kiocb *req, unsigned int issue_flags) { struct io_splice *sp = &req->splice; struct file *in = sp->file_in; @@ -3936,7 +3979,7 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock) loff_t *poff_in, *poff_out; long ret = 0; - if (force_nonblock) + if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; @@ -3957,14 +4000,14 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock) /* * IORING_OP_NOP just posts a completion event, nothing else. */ -static int io_nop(struct io_kiocb *req, struct io_comp_state *cs) +static int io_nop(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - __io_req_complete(req, 0, 0, cs); + __io_req_complete(req, issue_flags, 0, 0); return 0; } @@ -3989,13 +4032,13 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_fsync(struct io_kiocb *req, bool force_nonblock) +static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) { loff_t end = req->sync.off + req->sync.len; int ret; /* fsync always requires a blocking context */ - if (force_nonblock) + if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; ret = vfs_fsync_range(req->file, req->sync.off, @@ -4021,12 +4064,12 @@ static int io_fallocate_prep(struct io_kiocb *req, return 0; } -static int io_fallocate(struct io_kiocb *req, bool force_nonblock) +static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) { int ret; /* fallocate always requiring blocking context */ - if (force_nonblock) + if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, req->sync.len); @@ -4059,7 +4102,6 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return ret; } req->open.nofile = rlimit(RLIMIT_NOFILE); - req->open.ignore_nonblock = false; req->flags |= REQ_F_NEED_CLEANUP; return 0; } @@ -4097,43 +4139,53 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_openat_prep(req, sqe); } -static int io_openat2(struct io_kiocb *req, bool force_nonblock) +static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct open_flags op; struct file *file; + bool nonblock_set; + bool resolve_nonblock; int ret; - if (force_nonblock && !req->open.ignore_nonblock) - return -EAGAIN; - ret = build_open_flags(&req->open.how, &op); if (ret) goto err; + nonblock_set = op.open_flag & O_NONBLOCK; + resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED; + if (issue_flags & IO_URING_F_NONBLOCK) { + /* + * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, + * it'll always -EAGAIN + */ + if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) + return -EAGAIN; + op.lookup_flags |= LOOKUP_CACHED; + op.open_flag |= O_NONBLOCK; + } ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); if (ret < 0) goto err; file = do_filp_open(req->open.dfd, req->open.filename, &op); + /* only retry if RESOLVE_CACHED wasn't already set by application */ + if ((!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)) && + file == ERR_PTR(-EAGAIN)) { + /* + * We could hang on to this 'fd', but seems like marginal + * gain for something that is now known to be a slower path. + * So just put it, and we'll get a new one when we retry. + */ + put_unused_fd(ret); + return -EAGAIN; + } + if (IS_ERR(file)) { put_unused_fd(ret); ret = PTR_ERR(file); - /* - * A work-around to ensure that /proc/self works that way - * that it should - if we get -EOPNOTSUPP back, then assume - * that proc_self_get_link() failed us because we're in async - * context. We should be safe to retry this from the task - * itself with force_nonblock == false set, as it should not - * block on lookup. Would be nice to know this upfront and - * avoid the async dance, but doesn't seem feasible. - */ - if (ret == -EOPNOTSUPP && io_wq_current_is_worker()) { - req->open.ignore_nonblock = true; - refcount_inc(&req->refs); - io_req_task_queue(req); - return 0; - } } else { + if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) + file->f_flags &= ~O_NONBLOCK; fsnotify_open(file); fd_install(ret, file); } @@ -4146,9 +4198,9 @@ err: return 0; } -static int io_openat(struct io_kiocb *req, bool force_nonblock) +static int io_openat(struct io_kiocb *req, unsigned int issue_flags) { - return io_openat2(req, force_nonblock); + return io_openat2(req, issue_flags & IO_URING_F_NONBLOCK); } static int io_remove_buffers_prep(struct io_kiocb *req, @@ -4196,13 +4248,13 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, return i; } -static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; struct io_buffer *head; int ret = 0; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; io_ring_submit_lock(ctx, !force_nonblock); @@ -4217,11 +4269,11 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock, /* need to hold the lock to complete IOPOLL requests */ if (ctx->flags & IORING_SETUP_IOPOLL) { - __io_req_complete(req, ret, 0, cs); + __io_req_complete(req, issue_flags, ret, 0); io_ring_submit_unlock(ctx, !force_nonblock); } else { io_ring_submit_unlock(ctx, !force_nonblock); - __io_req_complete(req, ret, 0, cs); + __io_req_complete(req, issue_flags, ret, 0); } return 0; } @@ -4280,13 +4332,13 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) return i ? i : -ENOMEM; } -static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; struct io_buffer *head, *list; int ret = 0; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; io_ring_submit_lock(ctx, !force_nonblock); @@ -4312,11 +4364,11 @@ out: /* need to hold the lock to complete IOPOLL requests */ if (ctx->flags & IORING_SETUP_IOPOLL) { - __io_req_complete(req, ret, 0, cs); + __io_req_complete(req, issue_flags, ret, 0); io_ring_submit_unlock(ctx, !force_nonblock); } else { io_ring_submit_unlock(ctx, !force_nonblock); - __io_req_complete(req, ret, 0, cs); + __io_req_complete(req, issue_flags, ret, 0); } return 0; } @@ -4348,12 +4400,12 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, #endif } -static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) { #if defined(CONFIG_EPOLL) struct io_epoll *ie = &req->epoll; int ret; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); if (force_nonblock && ret == -EAGAIN) @@ -4361,7 +4413,7 @@ static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock, if (ret < 0) req_set_fail_links(req); - __io_req_complete(req, ret, 0, cs); + __io_req_complete(req, issue_flags, ret, 0); return 0; #else return -EOPNOTSUPP; @@ -4385,13 +4437,13 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) #endif } -static int io_madvise(struct io_kiocb *req, bool force_nonblock) +static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) struct io_madvise *ma = &req->madvise; int ret; - if (force_nonblock) + if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); @@ -4417,12 +4469,12 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_fadvise(struct io_kiocb *req, bool force_nonblock) +static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) { struct io_fadvise *fa = &req->fadvise; int ret; - if (force_nonblock) { + if (issue_flags & IO_URING_F_NONBLOCK) { switch (fa->advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: @@ -4458,12 +4510,12 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_statx(struct io_kiocb *req, bool force_nonblock) +static int io_statx(struct io_kiocb *req, unsigned int issue_flags) { struct io_statx *ctx = &req->statx; int ret; - if (force_nonblock) { + if (issue_flags & IO_URING_F_NONBLOCK) { /* only need file table for an actual valid fd */ if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD) req->flags |= REQ_F_NO_FILE_TABLE; @@ -4481,13 +4533,6 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock) static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - /* - * If we queue this for async, it must not be cancellable. That would - * leave the 'file' in an undeterminate state, and here need to modify - * io_wq_work.flags, so initialize io_wq_work firstly. - */ - io_req_init_async(req); - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || @@ -4497,44 +4542,59 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EBADF; req->close.fd = READ_ONCE(sqe->fd); - if ((req->file && req->file->f_op == &io_uring_fops)) - return -EBADF; - - req->close.put_file = NULL; return 0; } -static int io_close(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_close(struct io_kiocb *req, unsigned int issue_flags) { + struct files_struct *files = current->files; struct io_close *close = &req->close; + struct fdtable *fdt; + struct file *file; int ret; - /* might be already done during nonblock submission */ - if (!close->put_file) { - ret = close_fd_get_file(close->fd, &close->put_file); - if (ret < 0) - return (ret == -ENOENT) ? -EBADF : ret; + file = NULL; + ret = -EBADF; + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + if (close->fd >= fdt->max_fds) { + spin_unlock(&files->file_lock); + goto err; + } + file = fdt->fd[close->fd]; + if (!file) { + spin_unlock(&files->file_lock); + goto err; + } + + if (file->f_op == &io_uring_fops) { + spin_unlock(&files->file_lock); + file = NULL; + goto err; } /* if the file has a flush method, be safe and punt to async */ - if (close->put_file->f_op->flush && force_nonblock) { - /* not safe to cancel at this point */ - req->work.flags |= IO_WQ_WORK_NO_CANCEL; - /* was never set, but play safe */ - req->flags &= ~REQ_F_NOWAIT; - /* avoid grabbing files - we don't need the files */ - req->flags |= REQ_F_NO_FILE_TABLE; + if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { + spin_unlock(&files->file_lock); return -EAGAIN; } + ret = __close_fd_get_file(close->fd, &file); + spin_unlock(&files->file_lock); + if (ret < 0) { + if (ret == -ENOENT) + ret = -EBADF; + goto err; + } + /* No ->flush() or already async, safely close from here */ - ret = filp_close(close->put_file, req->work.identity->files); + ret = filp_close(file, current->files); +err: if (ret < 0) req_set_fail_links(req); - fput(close->put_file); - close->put_file = NULL; - __io_req_complete(req, ret, 0, cs); + if (file) + fput(file); + __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -4556,12 +4616,12 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) +static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) { int ret; /* sync_file_range always requires a blocking context */ - if (force_nonblock) + if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; ret = sync_file_range(req->file, req->sync.off, req->sync.len, @@ -4581,23 +4641,27 @@ static int io_setup_async_msg(struct io_kiocb *req, if (async_msg) return -EAGAIN; if (io_alloc_async_data(req)) { - if (kmsg->iov != kmsg->fast_iov) - kfree(kmsg->iov); + kfree(kmsg->free_iov); return -ENOMEM; } async_msg = req->async_data; req->flags |= REQ_F_NEED_CLEANUP; memcpy(async_msg, kmsg, sizeof(*kmsg)); + async_msg->msg.msg_name = &async_msg->addr; + /* if were using fast_iov, set it to the new one */ + if (!async_msg->free_iov) + async_msg->msg.msg_iter.iov = async_msg->fast_iov; + return -EAGAIN; } static int io_sendmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { - iomsg->iov = iomsg->fast_iov; iomsg->msg.msg_name = &iomsg->addr; + iomsg->free_iov = iomsg->fast_iov; return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, - req->sr_msg.msg_flags, &iomsg->iov); + req->sr_msg.msg_flags, &iomsg->free_iov); } static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -4626,8 +4690,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } -static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr iomsg, *kmsg; struct socket *sock; @@ -4638,14 +4701,8 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, if (unlikely(!sock)) return -ENOTSOCK; - if (req->async_data) { - kmsg = req->async_data; - kmsg->msg.msg_name = &kmsg->addr; - /* if iov is set, it's allocated already */ - if (!kmsg->iov) - kmsg->iov = kmsg->fast_iov; - kmsg->msg.msg_iter.iov = kmsg->iov; - } else { + kmsg = req->async_data; + if (!kmsg) { ret = io_sendmsg_copy_hdr(req, &iomsg); if (ret) return ret; @@ -4655,26 +4712,26 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, flags = req->sr_msg.msg_flags; if (flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) + else if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - if (force_nonblock && ret == -EAGAIN) + if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; - if (kmsg->iov != kmsg->fast_iov) - kfree(kmsg->iov); + /* fast path, check for non-NULL to avoid function call */ + if (kmsg->free_iov) + kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - __io_req_complete(req, ret, 0, cs); + __io_req_complete(req, issue_flags, ret, 0); return 0; } -static int io_send(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_send(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = &req->sr_msg; struct msghdr msg; @@ -4699,19 +4756,19 @@ static int io_send(struct io_kiocb *req, bool force_nonblock, flags = req->sr_msg.msg_flags; if (flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) + else if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; ret = sock_sendmsg(sock, &msg); - if (force_nonblock && ret == -EAGAIN) + if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; if (ret < 0) req_set_fail_links(req); - __io_req_complete(req, ret, 0, cs); + __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -4731,15 +4788,14 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, if (req->flags & REQ_F_BUFFER_SELECT) { if (iov_len > 1) return -EINVAL; - if (copy_from_user(iomsg->iov, uiov, sizeof(*uiov))) + if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) return -EFAULT; - sr->len = iomsg->iov[0].iov_len; - iov_iter_init(&iomsg->msg.msg_iter, READ, iomsg->iov, 1, - sr->len); - iomsg->iov = NULL; + sr->len = iomsg->fast_iov[0].iov_len; + iomsg->free_iov = NULL; } else { + iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, - &iomsg->iov, &iomsg->msg.msg_iter, + &iomsg->free_iov, &iomsg->msg.msg_iter, false); if (ret > 0) ret = 0; @@ -4778,11 +4834,11 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, if (clen < 0) return -EINVAL; sr->len = clen; - iomsg->iov[0].iov_len = clen; - iomsg->iov = NULL; + iomsg->free_iov = NULL; } else { + iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(READ, (struct iovec __user *)uiov, len, - UIO_FASTIOV, &iomsg->iov, + UIO_FASTIOV, &iomsg->free_iov, &iomsg->msg.msg_iter, true); if (ret < 0) return ret; @@ -4796,7 +4852,6 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { iomsg->msg.msg_name = &iomsg->addr; - iomsg->iov = iomsg->fast_iov; #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -4854,27 +4909,21 @@ static int io_recvmsg_prep(struct io_kiocb *req, return ret; } -static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr iomsg, *kmsg; struct socket *sock; struct io_buffer *kbuf; unsigned flags; int ret, cflags = 0; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; - if (req->async_data) { - kmsg = req->async_data; - kmsg->msg.msg_name = &kmsg->addr; - /* if iov is set, it's allocated already */ - if (!kmsg->iov) - kmsg->iov = kmsg->fast_iov; - kmsg->msg.msg_iter.iov = kmsg->iov; - } else { + kmsg = req->async_data; + if (!kmsg) { ret = io_recvmsg_copy_hdr(req, &iomsg); if (ret) return ret; @@ -4886,7 +4935,8 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, if (IS_ERR(kbuf)) return PTR_ERR(kbuf); kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, + kmsg->fast_iov[0].iov_len = req->sr_msg.len; + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, req->sr_msg.len); } @@ -4905,17 +4955,17 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_recv_kbuf(req); - if (kmsg->iov != kmsg->fast_iov) - kfree(kmsg->iov); + /* fast path, check for non-NULL to avoid function call */ + if (kmsg->free_iov) + kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - __io_req_complete(req, ret, cflags, cs); + __io_req_complete(req, issue_flags, ret, cflags); return 0; } -static int io_recv(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_recv(struct io_kiocb *req, unsigned int issue_flags) { struct io_buffer *kbuf; struct io_sr_msg *sr = &req->sr_msg; @@ -4925,6 +4975,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, struct iovec iov; unsigned flags; int ret, cflags = 0; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; sock = sock_from_file(req->file); if (unlikely(!sock)) @@ -4964,7 +5015,7 @@ out_free: cflags = io_put_recv_kbuf(req); if (ret < 0) req_set_fail_links(req); - __io_req_complete(req, ret, cflags, cs); + __io_req_complete(req, issue_flags, ret, cflags); return 0; } @@ -4984,10 +5035,10 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_accept(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_accept(struct io_kiocb *req, unsigned int issue_flags) { struct io_accept *accept = &req->accept; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; int ret; @@ -5004,7 +5055,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock, ret = -EINTR; req_set_fail_links(req); } - __io_req_complete(req, ret, 0, cs); + __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -5028,12 +5079,12 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) &io->address); } -static int io_connect(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_connect(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_connect __io, *io; unsigned file_flags; int ret; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; if (req->async_data) { io = req->async_data; @@ -5066,7 +5117,7 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock, out: if (ret < 0) req_set_fail_links(req); - __io_req_complete(req, ret, 0, cs); + __io_req_complete(req, issue_flags, ret, 0); return 0; } #else /* !CONFIG_NET */ @@ -5075,14 +5126,12 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { return -EOPNOTSUPP; } -static int io_send(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_send(struct io_kiocb *req, unsigned int issue_flags) { return -EOPNOTSUPP; } @@ -5093,14 +5142,12 @@ static int io_recvmsg_prep(struct io_kiocb *req, return -EOPNOTSUPP; } -static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { return -EOPNOTSUPP; } -static int io_recv(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_recv(struct io_kiocb *req, unsigned int issue_flags) { return -EOPNOTSUPP; } @@ -5110,8 +5157,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_accept(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_accept(struct io_kiocb *req, unsigned int issue_flags) { return -EOPNOTSUPP; } @@ -5121,8 +5167,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_connect(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_connect(struct io_kiocb *req, unsigned int issue_flags) { return -EOPNOTSUPP; } @@ -5148,7 +5193,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, list_del_init(&poll->wait.entry); req->result = mask; - init_task_work(&req->task_work, func); + req->task_work.func = func; percpu_ref_get(&req->ctx->refs); /* @@ -5159,12 +5204,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, */ ret = io_req_task_work_add(req); if (unlikely(ret)) { - struct task_struct *tsk; - WRITE_ONCE(poll->canceled, true); - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); + io_req_task_work_add_fallback(req, func); } return 1; } @@ -5607,7 +5648,7 @@ static int io_poll_remove_prep(struct io_kiocb *req, * Find a running poll command that matches one specified in sqe->addr, * and remove it if found. */ -static int io_poll_remove(struct io_kiocb *req) +static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5658,7 +5699,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return 0; } -static int io_poll_add(struct io_kiocb *req) +static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) { struct io_poll_iocb *poll = &req->poll; struct io_ring_ctx *ctx = req->ctx; @@ -5789,24 +5830,27 @@ static int io_timeout_remove_prep(struct io_kiocb *req, return 0; } +static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags) +{ + return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS + : HRTIMER_MODE_REL; +} + /* * Remove or update an existing timeout command */ -static int io_timeout_remove(struct io_kiocb *req) +static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) { struct io_timeout_rem *tr = &req->timeout_rem; struct io_ring_ctx *ctx = req->ctx; int ret; spin_lock_irq(&ctx->completion_lock); - if (req->timeout_rem.flags & IORING_TIMEOUT_UPDATE) { - enum hrtimer_mode mode = (tr->flags & IORING_TIMEOUT_ABS) - ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; - - ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); - } else { + if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) ret = io_timeout_cancel(ctx, tr->addr); - } + else + ret = io_timeout_update(ctx, tr->addr, &tr->ts, + io_translate_timeout_mode(tr->flags)); io_cqring_fill_event(req, ret); io_commit_cqring(ctx); @@ -5846,16 +5890,12 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; - if (flags & IORING_TIMEOUT_ABS) - data->mode = HRTIMER_MODE_ABS; - else - data->mode = HRTIMER_MODE_REL; - + data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); return 0; } -static int io_timeout(struct io_kiocb *req) +static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_timeout_data *data = req->async_data; @@ -5978,7 +6018,7 @@ static int io_async_cancel_prep(struct io_kiocb *req, return 0; } -static int io_async_cancel(struct io_kiocb *req) +static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; @@ -5986,7 +6026,7 @@ static int io_async_cancel(struct io_kiocb *req) return 0; } -static int io_files_update_prep(struct io_kiocb *req, +static int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL)) @@ -5996,34 +6036,33 @@ static int io_files_update_prep(struct io_kiocb *req, if (sqe->ioprio || sqe->rw_flags) return -EINVAL; - req->files_update.offset = READ_ONCE(sqe->off); - req->files_update.nr_args = READ_ONCE(sqe->len); - if (!req->files_update.nr_args) + req->rsrc_update.offset = READ_ONCE(sqe->off); + req->rsrc_update.nr_args = READ_ONCE(sqe->len); + if (!req->rsrc_update.nr_args) return -EINVAL; - req->files_update.arg = READ_ONCE(sqe->addr); + req->rsrc_update.arg = READ_ONCE(sqe->addr); return 0; } -static int io_files_update(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_uring_files_update up; + struct io_uring_rsrc_update up; int ret; - if (force_nonblock) + if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - up.offset = req->files_update.offset; - up.fds = req->files_update.arg; + up.offset = req->rsrc_update.offset; + up.data = req->rsrc_update.arg; mutex_lock(&ctx->uring_lock); - ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args); + ret = __io_sqe_files_update(ctx, &up, req->rsrc_update.nr_args); mutex_unlock(&ctx->uring_lock); if (ret < 0) req_set_fail_links(req); - __io_req_complete(req, ret, 0, cs); + __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -6073,7 +6112,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) case IORING_OP_CLOSE: return io_close_prep(req, sqe); case IORING_OP_FILES_UPDATE: - return io_files_update_prep(req, sqe); + return io_rsrc_update_prep(req, sqe); case IORING_OP_STATX: return io_statx_prep(req, sqe); case IORING_OP_FADVISE: @@ -6171,25 +6210,6 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EIOCBQUEUED; } -static void io_req_drop_files(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_uring_task *tctx = req->task->io_uring; - unsigned long flags; - - if (req->work.flags & IO_WQ_WORK_FILES) { - put_files_struct(req->work.identity->files); - put_nsproxy(req->work.identity->nsproxy); - } - spin_lock_irqsave(&ctx->inflight_lock, flags); - list_del(&req->inflight_entry); - spin_unlock_irqrestore(&ctx->inflight_lock, flags); - req->flags &= ~REQ_F_INFLIGHT; - req->work.flags &= ~IO_WQ_WORK_FILES; - if (atomic_read(&tctx->in_idle)) - wake_up(&tctx->wait); -} - static void __io_clean_op(struct io_kiocb *req) { if (req->flags & REQ_F_BUFFER_SELECTED) { @@ -6223,8 +6243,8 @@ static void __io_clean_op(struct io_kiocb *req) case IORING_OP_RECVMSG: case IORING_OP_SENDMSG: { struct io_async_msghdr *io = req->async_data; - if (io->iov != io->fast_iov) - kfree(io->iov); + + kfree(io->free_iov); break; } case IORING_OP_SPLICE: @@ -6249,112 +6269,111 @@ static void __io_clean_op(struct io_kiocb *req) } } -static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, - struct io_comp_state *cs) +static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; int ret; switch (req->opcode) { case IORING_OP_NOP: - ret = io_nop(req, cs); + ret = io_nop(req, issue_flags); break; case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: - ret = io_read(req, force_nonblock, cs); + ret = io_read(req, issue_flags); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: - ret = io_write(req, force_nonblock, cs); + ret = io_write(req, issue_flags); break; case IORING_OP_FSYNC: - ret = io_fsync(req, force_nonblock); + ret = io_fsync(req, issue_flags); break; case IORING_OP_POLL_ADD: - ret = io_poll_add(req); + ret = io_poll_add(req, issue_flags); break; case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req); + ret = io_poll_remove(req, issue_flags); break; case IORING_OP_SYNC_FILE_RANGE: - ret = io_sync_file_range(req, force_nonblock); + ret = io_sync_file_range(req, issue_flags); break; case IORING_OP_SENDMSG: - ret = io_sendmsg(req, force_nonblock, cs); + ret = io_sendmsg(req, issue_flags); break; case IORING_OP_SEND: - ret = io_send(req, force_nonblock, cs); + ret = io_send(req, issue_flags); break; case IORING_OP_RECVMSG: - ret = io_recvmsg(req, force_nonblock, cs); + ret = io_recvmsg(req, issue_flags); break; case IORING_OP_RECV: - ret = io_recv(req, force_nonblock, cs); + ret = io_recv(req, issue_flags); break; case IORING_OP_TIMEOUT: - ret = io_timeout(req); + ret = io_timeout(req, issue_flags); break; case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove(req); + ret = io_timeout_remove(req, issue_flags); break; case IORING_OP_ACCEPT: - ret = io_accept(req, force_nonblock, cs); + ret = io_accept(req, issue_flags); break; case IORING_OP_CONNECT: - ret = io_connect(req, force_nonblock, cs); + ret = io_connect(req, issue_flags); break; case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel(req); + ret = io_async_cancel(req, issue_flags); break; case IORING_OP_FALLOCATE: - ret = io_fallocate(req, force_nonblock); + ret = io_fallocate(req, issue_flags); break; case IORING_OP_OPENAT: - ret = io_openat(req, force_nonblock); + ret = io_openat(req, issue_flags); break; case IORING_OP_CLOSE: - ret = io_close(req, force_nonblock, cs); + ret = io_close(req, issue_flags); break; case IORING_OP_FILES_UPDATE: - ret = io_files_update(req, force_nonblock, cs); + ret = io_files_update(req, issue_flags); break; case IORING_OP_STATX: - ret = io_statx(req, force_nonblock); + ret = io_statx(req, issue_flags); break; case IORING_OP_FADVISE: - ret = io_fadvise(req, force_nonblock); + ret = io_fadvise(req, issue_flags); break; case IORING_OP_MADVISE: - ret = io_madvise(req, force_nonblock); + ret = io_madvise(req, issue_flags); break; case IORING_OP_OPENAT2: - ret = io_openat2(req, force_nonblock); + ret = io_openat2(req, issue_flags); break; case IORING_OP_EPOLL_CTL: - ret = io_epoll_ctl(req, force_nonblock, cs); + ret = io_epoll_ctl(req, issue_flags); break; case IORING_OP_SPLICE: - ret = io_splice(req, force_nonblock); + ret = io_splice(req, issue_flags); break; case IORING_OP_PROVIDE_BUFFERS: - ret = io_provide_buffers(req, force_nonblock, cs); + ret = io_provide_buffers(req, issue_flags); break; case IORING_OP_REMOVE_BUFFERS: - ret = io_remove_buffers(req, force_nonblock, cs); + ret = io_remove_buffers(req, issue_flags); break; case IORING_OP_TEE: - ret = io_tee(req, force_nonblock); + ret = io_tee(req, issue_flags); break; case IORING_OP_SHUTDOWN: - ret = io_shutdown(req, force_nonblock); + ret = io_shutdown(req, issue_flags); break; case IORING_OP_RENAMEAT: - ret = io_renameat(req, force_nonblock); + ret = io_renameat(req, issue_flags); break; case IORING_OP_UNLINKAT: - ret = io_unlinkat(req, force_nonblock); + ret = io_unlinkat(req, issue_flags); break; default: ret = -EINVAL; @@ -6381,7 +6400,7 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, return 0; } -static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) +static void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_kiocb *timeout; @@ -6391,15 +6410,12 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) if (timeout) io_queue_linked_timeout(timeout); - /* if NO_CANCEL is set, we must still run the work */ - if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) == - IO_WQ_WORK_CANCEL) { + if (work->flags & IO_WQ_WORK_CANCEL) ret = -ECANCELED; - } if (!ret) { do { - ret = io_issue_sqe(req, false, NULL); + ret = io_issue_sqe(req, 0); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -6435,14 +6451,12 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) if (lock_ctx) mutex_unlock(&lock_ctx->uring_lock); } - - return io_steal_work(req); } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, int index) { - struct fixed_file_table *table; + struct fixed_rsrc_table *table; table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT]; return table->files[index & IORING_FILE_TABLE_MASK]; @@ -6465,16 +6479,8 @@ static struct file *io_file_get(struct io_submit_state *state, file = __io_file_get(state, fd); } - if (file && file->f_op == &io_uring_fops && - !(req->flags & REQ_F_INFLIGHT)) { - io_req_init_async(req); - req->flags |= REQ_F_INFLIGHT; - - spin_lock_irq(&ctx->inflight_lock); - list_add(&req->inflight_entry, &ctx->inflight_list); - spin_unlock_irq(&ctx->inflight_lock); - } - + if (file && unlikely(file->f_op == &io_uring_fops)) + io_req_track_inflight(req); return file; } @@ -6503,9 +6509,10 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (prev) { req_set_fail_links(prev); io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); - io_put_req(prev); + io_put_req_deferred(prev, 1); } else { - io_req_complete(req, -ETIME); + io_req_complete_post(req, -ETIME, 0); + io_put_req_deferred(req, 1); } return HRTIMER_NORESTART; } @@ -6551,27 +6558,21 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return nxt; } -static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs) +static void __io_queue_sqe(struct io_kiocb *req) { - struct io_kiocb *linked_timeout; + struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); const struct cred *old_creds = NULL; int ret; -again: - linked_timeout = io_prep_linked_timeout(req); - if ((req->flags & REQ_F_WORK_INITIALIZED) && (req->work.flags & IO_WQ_WORK_CREDS) && - req->work.identity->creds != current_cred()) { - if (old_creds) - revert_creds(old_creds); - if (old_creds == req->work.identity->creds) - old_creds = NULL; /* restored original creds */ - else - old_creds = override_creds(req->work.identity->creds); - } + req->work.identity->creds != current_cred()) + old_creds = override_creds(req->work.identity->creds); - ret = io_issue_sqe(req, true, cs); + ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); + + if (old_creds) + revert_creds(old_creds); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -6585,34 +6586,28 @@ again: */ io_queue_async_work(req); } - - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); } else if (likely(!ret)) { /* drop submission reference */ - req = io_put_req_find_next(req); - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); + if (req->flags & REQ_F_COMPLETE_INLINE) { + struct io_ring_ctx *ctx = req->ctx; + struct io_comp_state *cs = &ctx->submit_state.comp; - if (req) { - if (!(req->flags & REQ_F_FORCE_ASYNC)) - goto again; - io_queue_async_work(req); + cs->reqs[cs->nr++] = req; + if (cs->nr == ARRAY_SIZE(cs->reqs)) + io_submit_flush_completions(cs, ctx); + } else { + io_put_req(req); } } else { - /* un-prep timeout, so it'll be killed as any other linked */ - req->flags &= ~REQ_F_LINK_TIMEOUT; req_set_fail_links(req); io_put_req(req); io_req_complete(req, ret); } - - if (old_creds) - revert_creds(old_creds); + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); } -static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_comp_state *cs) +static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { int ret; @@ -6637,18 +6632,17 @@ fail_req: if (unlikely(ret)) goto fail_req; } - __io_queue_sqe(req, cs); + __io_queue_sqe(req); } } -static inline void io_queue_link_head(struct io_kiocb *req, - struct io_comp_state *cs) +static inline void io_queue_link_head(struct io_kiocb *req) { if (unlikely(req->flags & REQ_F_FAIL_LINK)) { io_put_req(req); io_req_complete(req, -ECANCELED); } else - io_queue_sqe(req, NULL, cs); + io_queue_sqe(req, NULL); } struct io_submit_link { @@ -6657,7 +6651,7 @@ struct io_submit_link { }; static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_submit_link *link, struct io_comp_state *cs) + struct io_submit_link *link) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -6695,7 +6689,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, /* last request of a link, enqueue the link */ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - io_queue_link_head(head, cs); + io_queue_link_head(head); link->head = NULL; } } else { @@ -6710,7 +6704,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, link->head = req; link->last = req; } else { - io_queue_sqe(req, sqe, cs); + io_queue_sqe(req, sqe); } } @@ -6720,29 +6714,23 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, /* * Batched submission is done, ensure local IO is flushed out. */ -static void io_submit_state_end(struct io_submit_state *state) +static void io_submit_state_end(struct io_submit_state *state, + struct io_ring_ctx *ctx) { - if (!list_empty(&state->comp.list)) - io_submit_flush_completions(&state->comp); + if (state->comp.nr) + io_submit_flush_completions(&state->comp, ctx); if (state->plug_started) blk_finish_plug(&state->plug); io_state_file_put(state); - if (state->free_reqs) - kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); } /* * Start submission side cache. */ static void io_submit_state_start(struct io_submit_state *state, - struct io_ring_ctx *ctx, unsigned int max_ios) + unsigned int max_ios) { state->plug_started = false; - state->comp.nr = 0; - INIT_LIST_HEAD(&state->comp.list); - state->comp.ctx = ctx; - state->free_reqs = 0; - state->file_refs = 0; state->ios_left = max_ios; } @@ -6779,7 +6767,7 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) * 2) allows the kernel side to track the head on its own, even * though the application is the one updating it. */ - head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]); + head = READ_ONCE(sq_array[ctx->cached_sq_head++ & ctx->sq_mask]); if (likely(head < ctx->sq_entries)) return &ctx->sq_sqes[head]; @@ -6789,11 +6777,6 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) return NULL; } -static inline void io_consume_sqe(struct io_ring_ctx *ctx) -{ - ctx->cached_sq_head++; -} - /* * Check SQE restrictions (opcode and flags). * @@ -6825,36 +6808,36 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, IOSQE_BUFFER_SELECT) static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe, - struct io_submit_state *state) + const struct io_uring_sqe *sqe) { + struct io_submit_state *state; unsigned int sqe_flags; - int id, ret; + int id, ret = 0; req->opcode = READ_ONCE(sqe->opcode); + /* same numerical values with corresponding REQ_F_*, safe to copy */ + req->flags = sqe_flags = READ_ONCE(sqe->flags); req->user_data = READ_ONCE(sqe->user_data); req->async_data = NULL; req->file = NULL; req->ctx = ctx; - req->flags = 0; req->link = NULL; - req->fixed_file_refs = NULL; + req->fixed_rsrc_refs = NULL; /* one is dropped after submission, the other at completion */ refcount_set(&req->refs, 2); req->task = current; req->result = 0; + /* enforce forwards compatibility on users */ + if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) + return -EINVAL; + if (unlikely(req->opcode >= IORING_OP_LAST)) return -EINVAL; if (unlikely(io_sq_thread_acquire_mm_files(ctx, req))) return -EFAULT; - sqe_flags = READ_ONCE(sqe->flags); - /* enforce forwards compatibility on users */ - if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) - return -EINVAL; - if (unlikely(!io_check_restriction(ctx, req, sqe_flags))) return -EACCES; @@ -6877,8 +6860,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->work.flags |= IO_WQ_WORK_CREDS; } - /* same numerical values with corresponding REQ_F_*, safe to copy */ - req->flags |= sqe_flags; + state = &ctx->submit_state; /* * Plug now if we have more than 1 IO left after this, and the target @@ -6890,13 +6872,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, state->plug_started = true; } - ret = 0; if (io_op_defs[req->opcode].needs_file) { bool fixed = req->flags & REQ_F_FIXED_FILE; req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); - if (unlikely(!req->file && - !io_op_defs[req->opcode].needs_file_no_error)) + if (unlikely(!req->file)) ret = -EBADF; } @@ -6906,7 +6886,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) { - struct io_submit_state state; struct io_submit_link link; int i, submitted = 0; @@ -6925,7 +6904,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) percpu_counter_add(¤t->io_uring->inflight, nr); refcount_add(nr, ¤t->usage); - io_submit_state_start(&state, ctx, nr); + io_submit_state_start(&ctx->submit_state, nr); link.head = NULL; for (i = 0; i < nr; i++) { @@ -6933,22 +6912,21 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) struct io_kiocb *req; int err; - sqe = io_get_sqe(ctx); - if (unlikely(!sqe)) { - io_consume_sqe(ctx); - break; - } - req = io_alloc_req(ctx, &state); + req = io_alloc_req(ctx); if (unlikely(!req)) { if (!submitted) submitted = -EAGAIN; break; } - io_consume_sqe(ctx); + sqe = io_get_sqe(ctx); + if (unlikely(!sqe)) { + kmem_cache_free(req_cachep, req); + break; + } /* will complete beyond this point, count as submitted */ submitted++; - err = io_init_req(ctx, req, sqe, &state); + err = io_init_req(ctx, req, sqe); if (unlikely(err)) { fail_req: io_put_req(req); @@ -6957,8 +6935,8 @@ fail_req: } trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, - true, io_async_submit(ctx)); - err = io_submit_sqe(req, sqe, &link, &state.comp); + true, ctx->flags & IORING_SETUP_SQPOLL); + err = io_submit_sqe(req, sqe, &link); if (err) goto fail_req; } @@ -6973,8 +6951,8 @@ fail_req: put_task_struct_many(current, unused); } if (link.head) - io_queue_link_head(link.head, &state.comp); - io_submit_state_end(&state); + io_queue_link_head(link.head); + io_submit_state_end(&ctx->submit_state, ctx); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); @@ -7123,9 +7101,6 @@ static int io_sq_thread(void *data) continue; } - if (kthread_should_park()) - continue; - needs_sched = true; prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { @@ -7140,7 +7115,7 @@ static int io_sq_thread(void *data) } } - if (needs_sched) { + if (needs_sched && !kthread_should_park()) { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); @@ -7217,6 +7192,25 @@ static int io_run_task_work_sig(void) return -EINTR; } +/* when returns >0, the caller should retry */ +static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + signed long *timeout) +{ + int ret; + + /* make sure we run task_work before checking for signals */ + ret = io_run_task_work_sig(); + if (ret || io_should_wake(iowq)) + return ret; + /* let the caller flush overflows, retry */ + if (test_bit(0, &ctx->cq_check_overflow)) + return 1; + + *timeout = schedule_timeout(*timeout); + return !*timeout ? -ETIME : 1; +} + /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. @@ -7235,9 +7229,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, .to_wait = min_events, }; struct io_rings *rings = ctx->rings; - struct timespec64 ts; - signed long timeout = 0; - int ret = 0; + signed long timeout = MAX_SCHEDULE_TIMEOUT; + int ret; do { io_cqring_overflow_flush(ctx, false, NULL, NULL); @@ -7261,6 +7254,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, } if (uts) { + struct timespec64 ts; + if (get_timespec64(&ts, uts)) return -EFAULT; timeout = timespec64_to_jiffies(&ts); @@ -7272,31 +7267,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, io_cqring_overflow_flush(ctx, false, NULL, NULL); prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); - /* make sure we run task_work before checking for signals */ - ret = io_run_task_work_sig(); - if (ret > 0) { - finish_wait(&ctx->wait, &iowq.wq); - continue; - } - else if (ret < 0) - break; - if (io_should_wake(&iowq)) - break; - if (test_bit(0, &ctx->cq_check_overflow)) { - finish_wait(&ctx->wait, &iowq.wq); - continue; - } - if (uts) { - timeout = schedule_timeout(timeout); - if (timeout == 0) { - ret = -ETIME; - break; - } - } else { - schedule(); - } - } while (1); - finish_wait(&ctx->wait, &iowq.wq); + ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); + finish_wait(&ctx->wait, &iowq.wq); + } while (ret > 0); restore_saved_sigmask_unless(ret == -EINTR); @@ -7326,47 +7299,52 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) #endif } -static void io_file_ref_kill(struct percpu_ref *ref) +static void io_rsrc_data_ref_zero(struct percpu_ref *ref) { - struct fixed_file_data *data; + struct fixed_rsrc_data *data; - data = container_of(ref, struct fixed_file_data, refs); + data = container_of(ref, struct fixed_rsrc_data, refs); complete(&data->done); } -static void io_sqe_files_set_node(struct fixed_file_data *file_data, - struct fixed_file_ref_node *ref_node) +static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx) { - spin_lock_bh(&file_data->lock); - file_data->node = ref_node; - list_add_tail(&ref_node->node, &file_data->ref_list); - spin_unlock_bh(&file_data->lock); - percpu_ref_get(&file_data->refs); + spin_lock_bh(&ctx->rsrc_ref_lock); } -static int io_sqe_files_unregister(struct io_ring_ctx *ctx) +static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx) { - struct fixed_file_data *data = ctx->file_data; - struct fixed_file_ref_node *backup_node, *ref_node = NULL; - unsigned nr_tables, i; - int ret; + spin_unlock_bh(&ctx->rsrc_ref_lock); +} - if (!data) - return -ENXIO; - backup_node = alloc_fixed_file_ref_node(ctx); - if (!backup_node) - return -ENOMEM; +static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx, + struct fixed_rsrc_data *rsrc_data, + struct fixed_rsrc_ref_node *ref_node) +{ + io_rsrc_ref_lock(ctx); + rsrc_data->node = ref_node; + list_add_tail(&ref_node->node, &ctx->rsrc_ref_list); + io_rsrc_ref_unlock(ctx); + percpu_ref_get(&rsrc_data->refs); +} + +static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, + struct io_ring_ctx *ctx, + struct fixed_rsrc_ref_node *backup_node) +{ + struct fixed_rsrc_ref_node *ref_node; + int ret; - spin_lock_bh(&data->lock); + io_rsrc_ref_lock(ctx); ref_node = data->node; - spin_unlock_bh(&data->lock); + io_rsrc_ref_unlock(ctx); if (ref_node) percpu_ref_kill(&ref_node->refs); percpu_ref_kill(&data->refs); /* wait for all refs nodes to complete */ - flush_delayed_work(&ctx->file_put_work); + flush_delayed_work(&ctx->rsrc_put_work); do { ret = wait_for_completion_interruptible(&data->done); if (!ret) @@ -7375,21 +7353,65 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) if (ret < 0) { percpu_ref_resurrect(&data->refs); reinit_completion(&data->done); - io_sqe_files_set_node(data, backup_node); + io_sqe_rsrc_set_node(ctx, data, backup_node); return ret; } } while (1); + destroy_fixed_rsrc_ref_node(backup_node); + return 0; +} + +static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx) +{ + struct fixed_rsrc_data *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return NULL; + + if (percpu_ref_init(&data->refs, io_rsrc_data_ref_zero, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + kfree(data); + return NULL; + } + data->ctx = ctx; + init_completion(&data->done); + return data; +} + +static void free_fixed_rsrc_data(struct fixed_rsrc_data *data) +{ + percpu_ref_exit(&data->refs); + kfree(data->table); + kfree(data); +} + +static int io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ + struct fixed_rsrc_data *data = ctx->file_data; + struct fixed_rsrc_ref_node *backup_node; + unsigned nr_tables, i; + int ret; + + if (!data) + return -ENXIO; + backup_node = alloc_fixed_rsrc_ref_node(ctx); + if (!backup_node) + return -ENOMEM; + init_fixed_file_ref_node(ctx, backup_node); + + ret = io_rsrc_ref_quiesce(data, ctx, backup_node); + if (ret) + return ret; + __io_sqe_files_unregister(ctx); nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); for (i = 0; i < nr_tables; i++) kfree(data->table[i].files); - kfree(data->table); - percpu_ref_exit(&data->refs); - kfree(data); + free_fixed_rsrc_data(data); ctx->file_data = NULL; ctx->nr_user_files = 0; - destroy_fixed_file_ref_node(backup_node); return 0; } @@ -7612,13 +7634,13 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) } #endif -static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data, +static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data, unsigned nr_tables, unsigned nr_files) { int i; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &file_data->table[i]; + struct fixed_rsrc_table *table = &file_data->table[i]; unsigned this_files; this_files = min(nr_files, IORING_MAX_FILES_TABLE); @@ -7633,14 +7655,15 @@ static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data, return 0; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &file_data->table[i]; + struct fixed_rsrc_table *table = &file_data->table[i]; kfree(table->files); } return 1; } -static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file) +static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) { + struct file *file = prsrc->file; #if defined(CONFIG_UNIX) struct sock *sock = ctx->ring_sock->sk; struct sk_buff_head list, *head = &sock->sk_receive_queue; @@ -7701,108 +7724,119 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file) #endif } -struct io_file_put { - struct list_head list; - struct file *file; -}; - -static void __io_file_put_work(struct fixed_file_ref_node *ref_node) +static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node) { - struct fixed_file_data *file_data = ref_node->file_data; - struct io_ring_ctx *ctx = file_data->ctx; - struct io_file_put *pfile, *tmp; + struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data; + struct io_ring_ctx *ctx = rsrc_data->ctx; + struct io_rsrc_put *prsrc, *tmp; - list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) { - list_del(&pfile->list); - io_ring_file_put(ctx, pfile->file); - kfree(pfile); + list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { + list_del(&prsrc->list); + ref_node->rsrc_put(ctx, prsrc); + kfree(prsrc); } percpu_ref_exit(&ref_node->refs); kfree(ref_node); - percpu_ref_put(&file_data->refs); + percpu_ref_put(&rsrc_data->refs); } -static void io_file_put_work(struct work_struct *work) +static void io_rsrc_put_work(struct work_struct *work) { struct io_ring_ctx *ctx; struct llist_node *node; - ctx = container_of(work, struct io_ring_ctx, file_put_work.work); - node = llist_del_all(&ctx->file_put_llist); + ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); + node = llist_del_all(&ctx->rsrc_put_llist); while (node) { - struct fixed_file_ref_node *ref_node; + struct fixed_rsrc_ref_node *ref_node; struct llist_node *next = node->next; - ref_node = llist_entry(node, struct fixed_file_ref_node, llist); - __io_file_put_work(ref_node); + ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist); + __io_rsrc_put_work(ref_node); node = next; } } -static void io_file_data_ref_zero(struct percpu_ref *ref) +static struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data, + unsigned i) +{ + struct fixed_rsrc_table *table; + + table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT]; + return &table->files[i & IORING_FILE_TABLE_MASK]; +} + +static void io_rsrc_node_ref_zero(struct percpu_ref *ref) { - struct fixed_file_ref_node *ref_node; - struct fixed_file_data *data; + struct fixed_rsrc_ref_node *ref_node; + struct fixed_rsrc_data *data; struct io_ring_ctx *ctx; bool first_add = false; int delay = HZ; - ref_node = container_of(ref, struct fixed_file_ref_node, refs); - data = ref_node->file_data; + ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs); + data = ref_node->rsrc_data; ctx = data->ctx; - spin_lock_bh(&data->lock); + io_rsrc_ref_lock(ctx); ref_node->done = true; - while (!list_empty(&data->ref_list)) { - ref_node = list_first_entry(&data->ref_list, - struct fixed_file_ref_node, node); + while (!list_empty(&ctx->rsrc_ref_list)) { + ref_node = list_first_entry(&ctx->rsrc_ref_list, + struct fixed_rsrc_ref_node, node); /* recycle ref nodes in order */ if (!ref_node->done) break; list_del(&ref_node->node); - first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist); + first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist); } - spin_unlock_bh(&data->lock); + io_rsrc_ref_unlock(ctx); if (percpu_ref_is_dying(&data->refs)) delay = 0; if (!delay) - mod_delayed_work(system_wq, &ctx->file_put_work, 0); + mod_delayed_work(system_wq, &ctx->rsrc_put_work, 0); else if (first_add) - queue_delayed_work(system_wq, &ctx->file_put_work, delay); + queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay); } -static struct fixed_file_ref_node *alloc_fixed_file_ref_node( +static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node( struct io_ring_ctx *ctx) { - struct fixed_file_ref_node *ref_node; + struct fixed_rsrc_ref_node *ref_node; ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); if (!ref_node) return NULL; - if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero, + if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, 0, GFP_KERNEL)) { kfree(ref_node); return NULL; } INIT_LIST_HEAD(&ref_node->node); - INIT_LIST_HEAD(&ref_node->file_list); - ref_node->file_data = ctx->file_data; + INIT_LIST_HEAD(&ref_node->rsrc_list); ref_node->done = false; return ref_node; } -static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node) +static void init_fixed_file_ref_node(struct io_ring_ctx *ctx, + struct fixed_rsrc_ref_node *ref_node) +{ + ref_node->rsrc_data = ctx->file_data; + ref_node->rsrc_put = io_ring_file_put; +} + +static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node) { percpu_ref_exit(&ref_node->refs); kfree(ref_node); } + static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { @@ -7810,8 +7844,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_tables, i; struct file *file; int fd, ret = -ENOMEM; - struct fixed_file_ref_node *ref_node; - struct fixed_file_data *file_data; + struct fixed_rsrc_ref_node *ref_node; + struct fixed_rsrc_data *file_data; if (ctx->file_data) return -EBUSY; @@ -7820,13 +7854,10 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; - file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL); + file_data = alloc_fixed_rsrc_data(ctx); if (!file_data) return -ENOMEM; - file_data->ctx = ctx; - init_completion(&file_data->done); - INIT_LIST_HEAD(&file_data->ref_list); - spin_lock_init(&file_data->lock); + ctx->file_data = file_data; nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); file_data->table = kcalloc(nr_tables, sizeof(*file_data->table), @@ -7834,18 +7865,10 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (!file_data->table) goto out_free; - if (percpu_ref_init(&file_data->refs, io_file_ref_kill, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) - goto out_free; - if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args)) - goto out_ref; - ctx->file_data = file_data; + goto out_free; for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - struct fixed_file_table *table; - unsigned index; - if (copy_from_user(&fd, &fds[i], sizeof(fd))) { ret = -EFAULT; goto out_fput; @@ -7870,9 +7893,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, fput(file); goto out_fput; } - table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT]; - index = i & IORING_FILE_TABLE_MASK; - table->files[index] = file; + *io_fixed_file_slot(file_data, i) = file; } ret = io_sqe_files_scm(ctx); @@ -7881,13 +7902,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } - ref_node = alloc_fixed_file_ref_node(ctx); + ref_node = alloc_fixed_rsrc_ref_node(ctx); if (!ref_node) { io_sqe_files_unregister(ctx); return -ENOMEM; } + init_fixed_file_ref_node(ctx, ref_node); - io_sqe_files_set_node(file_data, ref_node); + io_sqe_rsrc_set_node(ctx, file_data, ref_node); return ret; out_fput: for (i = 0; i < ctx->nr_user_files; i++) { @@ -7898,11 +7920,8 @@ out_fput: for (i = 0; i < nr_tables; i++) kfree(file_data->table[i].files); ctx->nr_user_files = 0; -out_ref: - percpu_ref_exit(&file_data->refs); out_free: - kfree(file_data->table); - kfree(file_data); + free_fixed_rsrc_data(ctx->file_data); ctx->file_data = NULL; return ret; } @@ -7950,29 +7969,34 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } -static int io_queue_file_removal(struct fixed_file_data *data, - struct file *file) +static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc) { - struct io_file_put *pfile; - struct fixed_file_ref_node *ref_node = data->node; + struct io_rsrc_put *prsrc; + struct fixed_rsrc_ref_node *ref_node = data->node; - pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); - if (!pfile) + prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); + if (!prsrc) return -ENOMEM; - pfile->file = file; - list_add(&pfile->list, &ref_node->file_list); + prsrc->rsrc = rsrc; + list_add(&prsrc->list, &ref_node->rsrc_list); return 0; } +static inline int io_queue_file_removal(struct fixed_rsrc_data *data, + struct file *file) +{ + return io_queue_rsrc_removal(data, (void *)file); +} + static int __io_sqe_files_update(struct io_ring_ctx *ctx, - struct io_uring_files_update *up, + struct io_uring_rsrc_update *up, unsigned nr_args) { - struct fixed_file_data *data = ctx->file_data; - struct fixed_file_ref_node *ref_node; - struct file *file; + struct fixed_rsrc_data *data = ctx->file_data; + struct fixed_rsrc_ref_node *ref_node; + struct file *file, **file_slot; __s32 __user *fds; int fd, i, err; __u32 done; @@ -7983,30 +8007,29 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (done > ctx->nr_user_files) return -EINVAL; - ref_node = alloc_fixed_file_ref_node(ctx); + ref_node = alloc_fixed_rsrc_ref_node(ctx); if (!ref_node) return -ENOMEM; + init_fixed_file_ref_node(ctx, ref_node); - done = 0; - fds = u64_to_user_ptr(up->fds); - while (nr_args) { - struct fixed_file_table *table; - unsigned index; - + fds = u64_to_user_ptr(up->data); + for (done = 0; done < nr_args; done++) { err = 0; if (copy_from_user(&fd, &fds[done], sizeof(fd))) { err = -EFAULT; break; } - i = array_index_nospec(up->offset, ctx->nr_user_files); - table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; - index = i & IORING_FILE_TABLE_MASK; - if (table->files[index]) { - file = table->files[index]; - err = io_queue_file_removal(data, file); + if (fd == IORING_REGISTER_FILES_SKIP) + continue; + + i = array_index_nospec(up->offset + done, ctx->nr_user_files); + file_slot = io_fixed_file_slot(ctx->file_data, i); + + if (*file_slot) { + err = io_queue_file_removal(data, *file_slot); if (err) break; - table->files[index] = NULL; + *file_slot = NULL; needs_switch = true; } if (fd != -1) { @@ -8028,24 +8051,21 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - table->files[index] = file; + *file_slot = file; err = io_sqe_file_register(ctx, file, i); if (err) { - table->files[index] = NULL; + *file_slot = NULL; fput(file); break; } } - nr_args--; - done++; - up->offset++; } if (needs_switch) { percpu_ref_kill(&data->node->refs); - io_sqe_files_set_node(data, ref_node); + io_sqe_rsrc_set_node(ctx, data, ref_node); } else - destroy_fixed_file_ref_node(ref_node); + destroy_fixed_rsrc_ref_node(ref_node); return done ? done : err; } @@ -8053,7 +8073,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { - struct io_uring_files_update up; + struct io_uring_rsrc_update up; if (!ctx->file_data) return -ENXIO; @@ -8067,12 +8087,12 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, return __io_sqe_files_update(ctx, &up, nr_args); } -static void io_free_work(struct io_wq_work *work) +static struct io_wq_work *io_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); - /* Consider that io_steal_work() relies on this ref */ - io_put_req(req); + req = io_put_req_find_next(req); + return req ? &req->work : NULL; } static int io_init_wq_offload(struct io_ring_ctx *ctx, @@ -8145,6 +8165,10 @@ static int io_uring_alloc_task_context(struct task_struct *task) io_init_identity(&tctx->__identity); tctx->identity = &tctx->__identity; task->io_uring = tctx; + spin_lock_init(&tctx->task_lock); + INIT_WQ_LIST(&tctx->task_list); + tctx->task_state = 0; + init_task_work(&tctx->task_work, tctx_task_work); return 0; } @@ -8266,25 +8290,16 @@ static inline int __io_account_mem(struct user_struct *user, return 0; } -static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, - enum io_mem_account acct) +static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { if (ctx->limit_mem) __io_unaccount_mem(ctx->user, nr_pages); - if (ctx->mm_account) { - if (acct == ACCT_LOCKED) { - mmap_write_lock(ctx->mm_account); - ctx->mm_account->locked_vm -= nr_pages; - mmap_write_unlock(ctx->mm_account); - }else if (acct == ACCT_PINNED) { - atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); - } - } + if (ctx->mm_account) + atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); } -static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, - enum io_mem_account acct) +static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { int ret; @@ -8294,15 +8309,8 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, return ret; } - if (ctx->mm_account) { - if (acct == ACCT_LOCKED) { - mmap_write_lock(ctx->mm_account); - ctx->mm_account->locked_vm += nr_pages; - mmap_write_unlock(ctx->mm_account); - } else if (acct == ACCT_PINNED) { - atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); - } - } + if (ctx->mm_account) + atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); return 0; } @@ -8322,7 +8330,7 @@ static void io_mem_free(void *ptr) static void *io_mem_alloc(size_t size) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | - __GFP_NORETRY; + __GFP_NORETRY | __GFP_ACCOUNT; return (void *) __get_free_pages(gfp_flags, get_order(size)); } @@ -8356,19 +8364,7 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, return off; } -static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries) -{ - size_t pages; - - pages = (size_t)1 << get_order( - rings_size(sq_entries, cq_entries, NULL)); - pages += (size_t)1 << get_order( - array_size(sizeof(struct io_uring_sqe), sq_entries)); - - return pages; -} - -static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) +static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) { int i, j; @@ -8382,7 +8378,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) unpin_user_page(imu->bvec[j].bv_page); if (imu->acct_pages) - io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED); + io_unaccount_mem(ctx, imu->acct_pages); kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -8480,21 +8476,105 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, if (!imu->acct_pages) return 0; - ret = io_account_mem(ctx, imu->acct_pages, ACCT_PINNED); + ret = io_account_mem(ctx, imu->acct_pages); if (ret) imu->acct_pages = 0; return ret; } -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) +static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, + struct io_mapped_ubuf *imu, + struct page **last_hpage) { struct vm_area_struct **vmas = NULL; struct page **pages = NULL; - struct page *last_hpage = NULL; - int i, j, got_pages = 0; - int ret = -EINVAL; + unsigned long off, start, end, ubuf; + size_t size; + int ret, pret, nr_pages, i; + + ubuf = (unsigned long) iov->iov_base; + end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ubuf >> PAGE_SHIFT; + nr_pages = end - start; + + ret = -ENOMEM; + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto done; + + vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!vmas) + goto done; + + imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec), + GFP_KERNEL); + if (!imu->bvec) + goto done; + + ret = 0; + mmap_read_lock(current->mm); + pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + pages, vmas); + if (pret == nr_pages) { + /* don't support file backed memory */ + for (i = 0; i < nr_pages; i++) { + struct vm_area_struct *vma = vmas[i]; + + if (vma->vm_file && + !is_file_hugepages(vma->vm_file)) { + ret = -EOPNOTSUPP; + break; + } + } + } else { + ret = pret < 0 ? pret : -EFAULT; + } + mmap_read_unlock(current->mm); + if (ret) { + /* + * if we did partial map, or found file backed vmas, + * release any pages we did get + */ + if (pret > 0) + unpin_user_pages(pages, pret); + kvfree(imu->bvec); + goto done; + } + + ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage); + if (ret) { + unpin_user_pages(pages, pret); + kvfree(imu->bvec); + goto done; + } + off = ubuf & ~PAGE_MASK; + size = iov->iov_len; + for (i = 0; i < nr_pages; i++) { + size_t vec_len; + + vec_len = min_t(size_t, size, PAGE_SIZE - off); + imu->bvec[i].bv_page = pages[i]; + imu->bvec[i].bv_len = vec_len; + imu->bvec[i].bv_offset = off; + off = 0; + size -= vec_len; + } + /* store original address for later verification */ + imu->ubuf = ubuf; + imu->len = iov->iov_len; + imu->nr_bvecs = nr_pages; + ret = 0; +done: + kvfree(pages); + kvfree(vmas); + return ret; +} + +static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) +{ if (ctx->user_bufs) return -EBUSY; if (!nr_args || nr_args > UIO_MAXIOV) @@ -8505,121 +8585,58 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, if (!ctx->user_bufs) return -ENOMEM; - for (i = 0; i < nr_args; i++) { - struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; - unsigned long off, start, end, ubuf; - int pret, nr_pages; - struct iovec iov; - size_t size; + return 0; +} - ret = io_copy_iov(ctx, &iov, arg, i); - if (ret) - goto err; +static int io_buffer_validate(struct iovec *iov) +{ + /* + * Don't impose further limits on the size and buffer + * constraints here, we'll -EINVAL later when IO is + * submitted if they are wrong. + */ + if (!iov->iov_base || !iov->iov_len) + return -EFAULT; - /* - * Don't impose further limits on the size and buffer - * constraints here, we'll -EINVAL later when IO is - * submitted if they are wrong. - */ - ret = -EFAULT; - if (!iov.iov_base || !iov.iov_len) - goto err; + /* arbitrary limit, but we need something */ + if (iov->iov_len > SZ_1G) + return -EFAULT; - /* arbitrary limit, but we need something */ - if (iov.iov_len > SZ_1G) - goto err; + return 0; +} - ubuf = (unsigned long) iov.iov_base; - end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = ubuf >> PAGE_SHIFT; - nr_pages = end - start; +static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int nr_args) +{ + int i, ret; + struct iovec iov; + struct page *last_hpage = NULL; - ret = 0; - if (!pages || nr_pages > got_pages) { - kvfree(vmas); - kvfree(pages); - pages = kvmalloc_array(nr_pages, sizeof(struct page *), - GFP_KERNEL); - vmas = kvmalloc_array(nr_pages, - sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!pages || !vmas) { - ret = -ENOMEM; - goto err; - } - got_pages = nr_pages; - } + ret = io_buffers_map_alloc(ctx, nr_args); + if (ret) + return ret; - imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec), - GFP_KERNEL); - ret = -ENOMEM; - if (!imu->bvec) - goto err; + for (i = 0; i < nr_args; i++) { + struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; - ret = 0; - mmap_read_lock(current->mm); - pret = pin_user_pages(ubuf, nr_pages, - FOLL_WRITE | FOLL_LONGTERM, - pages, vmas); - if (pret == nr_pages) { - /* don't support file backed memory */ - for (j = 0; j < nr_pages; j++) { - struct vm_area_struct *vma = vmas[j]; - - if (vma->vm_file && - !is_file_hugepages(vma->vm_file)) { - ret = -EOPNOTSUPP; - break; - } - } - } else { - ret = pret < 0 ? pret : -EFAULT; - } - mmap_read_unlock(current->mm); - if (ret) { - /* - * if we did partial map, or found file backed vmas, - * release any pages we did get - */ - if (pret > 0) - unpin_user_pages(pages, pret); - kvfree(imu->bvec); - goto err; - } + ret = io_copy_iov(ctx, &iov, arg, i); + if (ret) + break; - ret = io_buffer_account_pin(ctx, pages, pret, imu, &last_hpage); - if (ret) { - unpin_user_pages(pages, pret); - kvfree(imu->bvec); - goto err; - } + ret = io_buffer_validate(&iov); + if (ret) + break; - off = ubuf & ~PAGE_MASK; - size = iov.iov_len; - for (j = 0; j < nr_pages; j++) { - size_t vec_len; - - vec_len = min_t(size_t, size, PAGE_SIZE - off); - imu->bvec[j].bv_page = pages[j]; - imu->bvec[j].bv_len = vec_len; - imu->bvec[j].bv_offset = off; - off = 0; - size -= vec_len; - } - /* store original address for later verification */ - imu->ubuf = ubuf; - imu->len = iov.iov_len; - imu->nr_bvecs = nr_pages; + ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage); + if (ret) + break; ctx->nr_user_bufs++; } - kvfree(pages); - kvfree(vmas); - return 0; -err: - kvfree(pages); - kvfree(vmas); - io_sqe_buffer_unregister(ctx); + + if (ret) + io_sqe_buffers_unregister(ctx); + return ret; } @@ -8670,10 +8687,49 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) idr_destroy(&ctx->io_buffer_idr); } +static void io_req_cache_free(struct list_head *list, struct task_struct *tsk) +{ + struct io_kiocb *req, *nxt; + + list_for_each_entry_safe(req, nxt, list, compl.list) { + if (tsk && req->task != tsk) + continue; + list_del(&req->compl.list); + kmem_cache_free(req_cachep, req); + } +} + +static void io_req_caches_free(struct io_ring_ctx *ctx, struct task_struct *tsk) +{ + struct io_submit_state *submit_state = &ctx->submit_state; + + mutex_lock(&ctx->uring_lock); + + if (submit_state->free_reqs) + kmem_cache_free_bulk(req_cachep, submit_state->free_reqs, + submit_state->reqs); + + io_req_cache_free(&submit_state->comp.free_list, NULL); + + spin_lock_irq(&ctx->completion_lock); + io_req_cache_free(&submit_state->comp.locked_free_list, NULL); + spin_unlock_irq(&ctx->completion_lock); + + mutex_unlock(&ctx->uring_lock); +} + static void io_ring_ctx_free(struct io_ring_ctx *ctx) { + /* + * Some may use context even when all refs and requests have been put, + * and they are free to do so while still holding uring_lock, see + * __io_req_task_submit(). Wait for them to finish. + */ + mutex_lock(&ctx->uring_lock); + mutex_unlock(&ctx->uring_lock); + io_finish_async(ctx); - io_sqe_buffer_unregister(ctx); + io_sqe_buffers_unregister(ctx); if (ctx->sqo_task) { put_task_struct(ctx->sqo_task); @@ -8705,8 +8761,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) percpu_ref_exit(&ctx->refs); free_uid(ctx->user); put_cred(ctx->creds); + io_req_caches_free(ctx, NULL); kfree(ctx->cancel_hash); - kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx); } @@ -8723,8 +8779,21 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) smp_rmb(); if (!io_sqring_full(ctx)) mask |= EPOLLOUT | EPOLLWRNORM; - io_cqring_overflow_flush(ctx, false, NULL, NULL); - if (io_cqring_events(ctx)) + + /* + * Don't flush cqring overflow list here, just do a simple check. + * Otherwise there could possible be ABBA deadlock: + * CPU0 CPU1 + * ---- ---- + * lock(&ctx->uring_lock); + * lock(&ep->mtx); + * lock(&ctx->uring_lock); + * lock(&ep->mtx); + * + * Users may get EPOLLIN meanwhile seeing nothing in cqring, this + * pushs them to do the flush. + */ + if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow)) mask |= EPOLLIN | EPOLLRDNORM; return mask; @@ -8737,9 +8806,8 @@ static int io_uring_fasync(int fd, struct file *file, int on) return fasync_helper(fd, file, on, &ctx->cq_fasync); } -static int io_remove_personalities(int id, void *p, void *data) +static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) { - struct io_ring_ctx *ctx = data; struct io_identity *iod; iod = idr_remove(&ctx->personality_idr, id); @@ -8747,7 +8815,17 @@ static int io_remove_personalities(int id, void *p, void *data) put_cred(iod->creds); if (refcount_dec_and_test(&iod->count)) kfree(iod); + return 0; } + + return -EINVAL; +} + +static int io_remove_personalities(int id, void *p, void *data) +{ + struct io_ring_ctx *ctx = data; + + io_unregister_personality(ctx, id); return 0; } @@ -8763,7 +8841,7 @@ static void io_ring_exit_work(struct work_struct *work) * as nobody else will be looking for them. */ do { - __io_uring_cancel_task_requests(ctx, NULL); + io_uring_try_cancel_requests(ctx, NULL, NULL); } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); io_ring_ctx_free(ctx); } @@ -8787,6 +8865,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) ctx->cq_overflow_flushed = 1; if (ctx->rings) __io_cqring_overflow_flush(ctx, true, NULL, NULL); + idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); mutex_unlock(&ctx->uring_lock); io_kill_timeouts(ctx, NULL, NULL); @@ -8797,15 +8876,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) /* if we failed setting up the ctx, we might not have any rings */ io_iopoll_try_reap_events(ctx); - idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); - - /* - * Do this upfront, so we won't have a grace period where the ring - * is closed but resources aren't reaped yet. This can cause - * spurious failure in setting up a new ring. - */ - io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries), - ACCT_LOCKED); INIT_WORK(&ctx->exit_work, io_ring_exit_work); /* @@ -8877,6 +8947,40 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, } } +static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, + struct task_struct *task, + struct files_struct *files) +{ + struct io_task_cancel cancel = { .task = task, .files = files, }; + + while (1) { + enum io_wq_cancel cret; + bool ret = false; + + if (ctx->io_wq) { + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, + &cancel, true); + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); + } + + /* SQPOLL thread does its own polling */ + if (!(ctx->flags & IORING_SETUP_SQPOLL) && !files) { + while (!list_empty_careful(&ctx->iopoll_list)) { + io_iopoll_try_reap_events(ctx); + ret = true; + } + } + + ret |= io_poll_remove_all(ctx, task, files); + ret |= io_kill_timeouts(ctx, task, files); + ret |= io_run_task_work(); + io_cqring_overflow_flush(ctx, true, task, files); + if (!ret) + break; + cond_resched(); + } +} + static int io_uring_count_inflight(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files) @@ -8896,7 +9000,6 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { while (!list_empty_careful(&ctx->inflight_list)) { - struct io_task_cancel cancel = { .task = task, .files = files }; DEFINE_WAIT(wait); int inflight; @@ -8904,49 +9007,17 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, if (!inflight) break; - io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true); - io_poll_remove_all(ctx, task, files); - io_kill_timeouts(ctx, task, files); - io_cqring_overflow_flush(ctx, true, task, files); - /* cancellations _may_ trigger task work */ - io_run_task_work(); + io_uring_try_cancel_requests(ctx, task, files); + if (ctx->sq_data) + io_sq_thread_unpark(ctx->sq_data); prepare_to_wait(&task->io_uring->wait, &wait, TASK_UNINTERRUPTIBLE); if (inflight == io_uring_count_inflight(ctx, task, files)) schedule(); finish_wait(&task->io_uring->wait, &wait); - } -} - -static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, - struct task_struct *task) -{ - while (1) { - struct io_task_cancel cancel = { .task = task, .files = NULL, }; - enum io_wq_cancel cret; - bool ret = false; - - if (ctx->io_wq) { - cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, - &cancel, true); - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); - } - - /* SQPOLL thread does its own polling */ - if (!(ctx->flags & IORING_SETUP_SQPOLL)) { - while (!list_empty_careful(&ctx->iopoll_list)) { - io_iopoll_try_reap_events(ctx); - ret = true; - } - } - - ret |= io_poll_remove_all(ctx, task, NULL); - ret |= io_kill_timeouts(ctx, task, NULL); - ret |= io_run_task_work(); - if (!ret) - break; - cond_resched(); + if (ctx->sq_data) + io_sq_thread_park(ctx->sq_data); } } @@ -8979,11 +9050,10 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, } io_cancel_defer_files(ctx, task, files); - io_cqring_overflow_flush(ctx, true, task, files); io_uring_cancel_files(ctx, task, files); if (!files) - __io_uring_cancel_task_requests(ctx, task); + io_uring_try_cancel_requests(ctx, task, NULL); if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { atomic_dec(&task->io_uring->in_idle); @@ -9016,6 +9086,10 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) fput(file); return ret; } + + /* one and only SQPOLL file note, held by sqo_task */ + WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) && + current != ctx->sqo_task); } tctx->last = file; } @@ -9072,29 +9146,39 @@ void __io_uring_files_cancel(struct files_struct *files) static s64 tctx_inflight(struct io_uring_task *tctx) { - unsigned long index; - struct file *file; - s64 inflight; - - inflight = percpu_counter_sum(&tctx->inflight); - if (!tctx->sqpoll) - return inflight; + return percpu_counter_sum(&tctx->inflight); +} - /* - * If we have SQPOLL rings, then we need to iterate and find them, and - * add the pending count for those. - */ - xa_for_each(&tctx->xa, index, file) { - struct io_ring_ctx *ctx = file->private_data; +static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) +{ + struct io_uring_task *tctx; + s64 inflight; + DEFINE_WAIT(wait); - if (ctx->flags & IORING_SETUP_SQPOLL) { - struct io_uring_task *__tctx = ctx->sqo_task->io_uring; + if (!ctx->sq_data) + return; + tctx = ctx->sq_data->thread->io_uring; + io_disable_sqo_submit(ctx); - inflight += percpu_counter_sum(&__tctx->inflight); - } - } + atomic_inc(&tctx->in_idle); + do { + /* read completions before cancelations */ + inflight = tctx_inflight(tctx); + if (!inflight) + break; + io_uring_cancel_task_requests(ctx, NULL); - return inflight; + prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); + /* + * If we've seen completions, retry without waiting. This + * avoids a race where a completion comes in before we did + * prepare_to_wait(). + */ + if (inflight == tctx_inflight(tctx)) + schedule(); + finish_wait(&tctx->wait, &wait); + } while (1); + atomic_dec(&tctx->in_idle); } /* @@ -9111,8 +9195,13 @@ void __io_uring_task_cancel(void) atomic_inc(&tctx->in_idle); /* trigger io_disable_sqo_submit() */ - if (tctx->sqpoll) - __io_uring_files_cancel(NULL); + if (tctx->sqpoll) { + struct file *file; + unsigned long index; + + xa_for_each(&tctx->xa, index, file) + io_uring_cancel_sqpoll(file->private_data); + } do { /* read completions before cancelations */ @@ -9143,8 +9232,10 @@ static int io_uring_flush(struct file *file, void *data) struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx = file->private_data; - if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) { io_uring_cancel_task_requests(ctx, NULL); + io_req_caches_free(ctx, current); + } if (!tctx) return 0; @@ -9453,11 +9544,8 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1); seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); for (i = 0; has_lock && i < ctx->nr_user_files; i++) { - struct fixed_file_table *table; - struct file *f; + struct file *f = *io_fixed_file_slot(ctx->file_data, i); - table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; - f = table->files[i & IORING_FILE_TABLE_MASK]; if (f) seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); else @@ -9613,7 +9701,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, struct user_struct *user = NULL; struct io_ring_ctx *ctx; struct file *file; - bool limit_mem; int ret; if (!entries) @@ -9654,26 +9741,14 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, } user = get_uid(current_user()); - limit_mem = !capable(CAP_IPC_LOCK); - - if (limit_mem) { - ret = __io_account_mem(user, - ring_pages(p->sq_entries, p->cq_entries)); - if (ret) { - free_uid(user); - return ret; - } - } ctx = io_ring_ctx_alloc(p); if (!ctx) { - if (limit_mem) - __io_unaccount_mem(user, ring_pages(p->sq_entries, - p->cq_entries)); free_uid(user); return -ENOMEM; } ctx->compat = in_compat_syscall(); + ctx->limit_mem = !capable(CAP_IPC_LOCK); ctx->user = user; ctx->creds = get_current_cred(); #ifdef CONFIG_AUDIT @@ -9709,17 +9784,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; } #endif - - /* - * Account memory _before_ installing the file descriptor. Once - * the descriptor is installed, it can get closed at any time. Also - * do this before hitting the general error path, as ring freeing - * will un-account as well. - */ - io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), - ACCT_LOCKED); - ctx->limit_mem = limit_mem; - ret = io_allocate_scq_urings(ctx, p); if (ret) goto err; @@ -9877,21 +9941,6 @@ static int io_register_personality(struct io_ring_ctx *ctx) return ret; } -static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) -{ - struct io_identity *iod; - - iod = idr_remove(&ctx->personality_idr, id); - if (iod) { - put_cred(iod->creds); - if (refcount_dec_and_test(&iod->count)) - kfree(iod); - return 0; - } - - return -EINVAL; -} - static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args) { @@ -10049,13 +10098,13 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, switch (opcode) { case IORING_REGISTER_BUFFERS: - ret = io_sqe_buffer_register(ctx, arg, nr_args); + ret = io_sqe_buffers_register(ctx, arg, nr_args); break; case IORING_UNREGISTER_BUFFERS: ret = -EINVAL; if (arg || nr_args) break; - ret = io_sqe_buffer_unregister(ctx); + ret = io_sqe_buffers_unregister(ctx); break; case IORING_REGISTER_FILES: ret = io_sqe_files_register(ctx, arg, nr_args); @@ -10198,7 +10247,8 @@ static int __init io_uring_init(void) BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int)); - req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT); return 0; }; __initcall(io_uring_init); |