summaryrefslogtreecommitdiff
path: root/io_uring
diff options
context:
space:
mode:
authorJens Axboe <axboe@kernel.dk>2024-09-11 10:42:40 -0600
committerJens Axboe <axboe@kernel.dk>2024-09-11 10:42:40 -0600
commit6d0f8dcb3a634bbee46fcb028c5984c463f47812 (patch)
treee77e8999fa9d79363a9b11832a0d7795a0e8ef4f /io_uring
parent318ad4283a6efea8ce5ec2b3c65b6cb19df6b07e (diff)
parent84eacf177faa605853c58e5b1c0d9544b88c16fd (diff)
Merge branch 'for-6.12/io_uring' into for-6.12/io_uring-discard
* for-6.12/io_uring: (31 commits) io_uring/io-wq: inherit cpuset of cgroup in io worker io_uring/io-wq: do not allow pinning outside of cpuset io_uring/rw: drop -EOPNOTSUPP check in __io_complete_rw_common() io_uring/rw: treat -EOPNOTSUPP for IOCB_NOWAIT like -EAGAIN io_uring/sqpoll: do not allow pinning outside of cpuset io_uring/eventfd: move refs to refcount_t io_uring: remove unused rsrc_put_fn io_uring: add new line after variable declaration io_uring: add GCOV_PROFILE_URING Kconfig option io_uring/kbuf: add support for incremental buffer consumption io_uring/kbuf: pass in 'len' argument for buffer commit Revert "io_uring: Require zeroed sqe->len on provided-buffers send" io_uring/kbuf: move io_ring_head_to_buf() to kbuf.h io_uring/kbuf: add io_kbuf_commit() helper io_uring/kbuf: shrink nr_iovs/mode in struct buf_sel_arg io_uring: wire up min batch wake timeout io_uring: add support for batch wait timeout io_uring: implement our own schedule timeout handling io_uring: move schedule wait logic into helper io_uring: encapsulate extraneous wait flags into a separate struct ...
Diffstat (limited to 'io_uring')
-rw-r--r--io_uring/Makefile4
-rw-r--r--io_uring/eventfd.c13
-rw-r--r--io_uring/fdinfo.c14
-rw-r--r--io_uring/io-wq.c25
-rw-r--r--io_uring/io_uring.c212
-rw-r--r--io_uring/io_uring.h12
-rw-r--r--io_uring/kbuf.c96
-rw-r--r--io_uring/kbuf.h94
-rw-r--r--io_uring/napi.c35
-rw-r--r--io_uring/napi.h16
-rw-r--r--io_uring/net.c27
-rw-r--r--io_uring/register.c31
-rw-r--r--io_uring/rsrc.c149
-rw-r--r--io_uring/rsrc.h12
-rw-r--r--io_uring/rw.c19
-rw-r--r--io_uring/sqpoll.c7
16 files changed, 529 insertions, 237 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 61923e11c767..53167bef37d7 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -2,6 +2,10 @@
#
# Makefile for io_uring
+ifdef CONFIG_GCOV_PROFILE_URING
+GCOV_PROFILE := y
+endif
+
obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
tctx.o filetable.o rw.o net.o poll.o \
eventfd.o uring_cmd.o openclose.o \
diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index b9384503a2b7..e37fddd5d9ce 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -15,7 +15,7 @@ struct io_ev_fd {
struct eventfd_ctx *cq_ev_fd;
unsigned int eventfd_async: 1;
struct rcu_head rcu;
- atomic_t refs;
+ refcount_t refs;
atomic_t ops;
};
@@ -37,7 +37,7 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
- if (atomic_dec_and_test(&ev_fd->refs))
+ if (refcount_dec_and_test(&ev_fd->refs))
io_eventfd_free(rcu);
}
@@ -63,7 +63,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
*/
if (unlikely(!ev_fd))
return;
- if (!atomic_inc_not_zero(&ev_fd->refs))
+ if (!refcount_inc_not_zero(&ev_fd->refs))
return;
if (ev_fd->eventfd_async && !io_wq_current_is_worker())
goto out;
@@ -77,7 +77,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
}
}
out:
- if (atomic_dec_and_test(&ev_fd->refs))
+ if (refcount_dec_and_test(&ev_fd->refs))
call_rcu(&ev_fd->rcu, io_eventfd_free);
}
@@ -126,6 +126,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
if (IS_ERR(ev_fd->cq_ev_fd)) {
int ret = PTR_ERR(ev_fd->cq_ev_fd);
+
kfree(ev_fd);
return ret;
}
@@ -136,7 +137,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
ev_fd->eventfd_async = eventfd_async;
ctx->has_evfd = true;
- atomic_set(&ev_fd->refs, 1);
+ refcount_set(&ev_fd->refs, 1);
atomic_set(&ev_fd->ops, 0);
rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
return 0;
@@ -151,7 +152,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx)
if (ev_fd) {
ctx->has_evfd = false;
rcu_assign_pointer(ctx->io_ev_fd, NULL);
- if (atomic_dec_and_test(&ev_fd->refs))
+ if (refcount_dec_and_test(&ev_fd->refs))
call_rcu(&ev_fd->rcu, io_eventfd_free);
return 0;
}
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index b1e0e0d85349..d43e1b5fcb36 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -221,7 +221,19 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
cqe->user_data, cqe->res, cqe->flags);
}
-
spin_unlock(&ctx->completion_lock);
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (ctx->napi_enabled) {
+ seq_puts(m, "NAPI:\tenabled\n");
+ seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
+ if (ctx->napi_prefer_busy_poll)
+ seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
+ else
+ seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
+ } else {
+ seq_puts(m, "NAPI:\tdisabled\n");
+ }
+#endif
}
#endif
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index f1e7c670add8..a38f36b68060 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/rculist_nulls.h>
#include <linux/cpu.h>
+#include <linux/cpuset.h>
#include <linux/task_work.h>
#include <linux/audit.h>
#include <linux/mmu_context.h>
@@ -1167,7 +1168,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL))
goto err;
- cpumask_copy(wq->cpu_mask, cpu_possible_mask);
+ cpuset_cpus_allowed(data->task, wq->cpu_mask);
wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
wq->acct[IO_WQ_ACCT_UNBOUND].max_workers =
task_rlimit(current, RLIMIT_NPROC);
@@ -1322,17 +1323,29 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)
int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask)
{
+ cpumask_var_t allowed_mask;
+ int ret = 0;
+
if (!tctx || !tctx->io_wq)
return -EINVAL;
+ if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
+ return -ENOMEM;
+
rcu_read_lock();
- if (mask)
- cpumask_copy(tctx->io_wq->cpu_mask, mask);
- else
- cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask);
+ cpuset_cpus_allowed(tctx->io_wq->task, allowed_mask);
+ if (mask) {
+ if (cpumask_subset(mask, allowed_mask))
+ cpumask_copy(tctx->io_wq->cpu_mask, mask);
+ else
+ ret = -EINVAL;
+ } else {
+ cpumask_copy(tctx->io_wq->cpu_mask, allowed_mask);
+ }
rcu_read_unlock();
- return 0;
+ free_cpumask_var(allowed_mask);
+ return ret;
}
/*
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3942db160f18..1aca501efaf6 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -904,7 +904,7 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res)
lockdep_assert_held(&req->ctx->uring_lock);
req_set_fail(req);
- io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
+ io_req_set_res(req, res, io_put_kbuf(req, res, IO_URING_F_UNLOCKED));
if (def->fail)
def->fail(req);
io_req_complete_defer(req);
@@ -2350,22 +2350,92 @@ static bool current_pending_io(void)
return percpu_counter_read_positive(&tctx->inflight);
}
-/* when returns >0, the caller should retry */
-static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
- struct io_wait_queue *iowq)
+static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
{
- int ret;
+ struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
- if (unlikely(READ_ONCE(ctx->check_cq)))
- return 1;
- if (unlikely(!llist_empty(&ctx->work_llist)))
- return 1;
- if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
- return 1;
- if (unlikely(task_sigpending(current)))
- return -EINTR;
- if (unlikely(io_should_wake(iowq)))
- return 0;
+ WRITE_ONCE(iowq->hit_timeout, 1);
+ iowq->min_timeout = 0;
+ wake_up_process(iowq->wq.private);
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * Doing min_timeout portion. If we saw any timeouts, events, or have work,
+ * wake up. If not, and we have a normal timeout, switch to that and keep
+ * sleeping.
+ */
+static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
+{
+ struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
+ struct io_ring_ctx *ctx = iowq->ctx;
+
+ /* no general timeout, or shorter (or equal), we are done */
+ if (iowq->timeout == KTIME_MAX ||
+ ktime_compare(iowq->min_timeout, iowq->timeout) >= 0)
+ goto out_wake;
+ /* work we may need to run, wake function will see if we need to wake */
+ if (io_has_work(ctx))
+ goto out_wake;
+ /* got events since we started waiting, min timeout is done */
+ if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
+ goto out_wake;
+ /* if we have any events and min timeout expired, we're done */
+ if (io_cqring_events(ctx))
+ goto out_wake;
+
+ /*
+ * If using deferred task_work running and application is waiting on
+ * more than one request, ensure we reset it now where we are switching
+ * to normal sleeps. Any request completion post min_wait should wake
+ * the task and return.
+ */
+ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
+ atomic_set(&ctx->cq_wait_nr, 1);
+ smp_mb();
+ if (!llist_empty(&ctx->work_llist))
+ goto out_wake;
+ }
+
+ iowq->t.function = io_cqring_timer_wakeup;
+ hrtimer_set_expires(timer, iowq->timeout);
+ return HRTIMER_RESTART;
+out_wake:
+ return io_cqring_timer_wakeup(timer);
+}
+
+static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
+ clockid_t clock_id, ktime_t start_time)
+{
+ ktime_t timeout;
+
+ hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
+ if (iowq->min_timeout) {
+ timeout = ktime_add_ns(iowq->min_timeout, start_time);
+ iowq->t.function = io_cqring_min_timer_wakeup;
+ } else {
+ timeout = iowq->timeout;
+ iowq->t.function = io_cqring_timer_wakeup;
+ }
+
+ hrtimer_set_expires_range_ns(&iowq->t, timeout, 0);
+ hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
+
+ if (!READ_ONCE(iowq->hit_timeout))
+ schedule();
+
+ hrtimer_cancel(&iowq->t);
+ destroy_hrtimer_on_stack(&iowq->t);
+ __set_current_state(TASK_RUNNING);
+
+ return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
+}
+
+static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
+ struct io_wait_queue *iowq,
+ ktime_t start_time)
+{
+ int ret = 0;
/*
* Mark us as being in io_wait if we have pending requests, so cpufreq
@@ -2374,25 +2444,50 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
*/
if (current_pending_io())
current->in_iowait = 1;
- ret = 0;
- if (iowq->timeout == KTIME_MAX)
+ if (iowq->timeout != KTIME_MAX || iowq->min_timeout)
+ ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
+ else
schedule();
- else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
- ret = -ETIME;
current->in_iowait = 0;
return ret;
}
+/* If this returns > 0, the caller should retry */
+static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
+ struct io_wait_queue *iowq,
+ ktime_t start_time)
+{
+ if (unlikely(READ_ONCE(ctx->check_cq)))
+ return 1;
+ if (unlikely(!llist_empty(&ctx->work_llist)))
+ return 1;
+ if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
+ return 1;
+ if (unlikely(task_sigpending(current)))
+ return -EINTR;
+ if (unlikely(io_should_wake(iowq)))
+ return 0;
+
+ return __io_cqring_wait_schedule(ctx, iowq, start_time);
+}
+
+struct ext_arg {
+ size_t argsz;
+ struct __kernel_timespec __user *ts;
+ const sigset_t __user *sig;
+ ktime_t min_time;
+};
+
/*
* Wait until events become available, if we don't already have some. The
* application must reap them itself, as they reside on the shared cq ring.
*/
-static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
- const sigset_t __user *sig, size_t sigsz,
- struct __kernel_timespec __user *uts)
+static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
+ struct ext_arg *ext_arg)
{
struct io_wait_queue iowq;
struct io_rings *rings = ctx->rings;
+ ktime_t start_time;
int ret;
if (!io_allowed_run_tw(ctx))
@@ -2410,30 +2505,33 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
iowq.ctx = ctx;
- iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
+ iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
+ iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+ iowq.hit_timeout = 0;
+ iowq.min_timeout = ext_arg->min_time;
iowq.timeout = KTIME_MAX;
+ start_time = io_get_time(ctx);
- if (uts) {
+ if (ext_arg->ts) {
struct timespec64 ts;
- ktime_t dt;
- if (get_timespec64(&ts, uts))
+ if (get_timespec64(&ts, ext_arg->ts))
return -EFAULT;
- dt = timespec64_to_ktime(ts);
- iowq.timeout = ktime_add(dt, ktime_get());
- io_napi_adjust_timeout(ctx, &iowq, dt);
+ iowq.timeout = timespec64_to_ktime(ts);
+ if (!(flags & IORING_ENTER_ABS_TIMER))
+ iowq.timeout = ktime_add(iowq.timeout, start_time);
}
- if (sig) {
+ if (ext_arg->sig) {
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
- ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
- sigsz);
+ ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig,
+ ext_arg->argsz);
else
#endif
- ret = set_user_sigmask(sig, sigsz);
+ ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz);
if (ret)
return ret;
@@ -2443,8 +2541,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
trace_io_uring_cqring_wait(ctx, min_events);
do {
- int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
unsigned long check_cq;
+ int nr_wait;
+
+ /* if min timeout has been hit, don't reset wait count */
+ if (!iowq.hit_timeout)
+ nr_wait = (int) iowq.cq_tail -
+ READ_ONCE(ctx->rings->cq.tail);
+ else
+ nr_wait = 1;
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
atomic_set(&ctx->cq_wait_nr, nr_wait);
@@ -2454,7 +2559,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
TASK_INTERRUPTIBLE);
}
- ret = io_cqring_wait_schedule(ctx, &iowq);
+ ret = io_cqring_wait_schedule(ctx, &iowq, start_time);
__set_current_state(TASK_RUNNING);
atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
@@ -3112,9 +3217,8 @@ static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t a
return 0;
}
-static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
- struct __kernel_timespec __user **ts,
- const sigset_t __user **sig)
+static int io_get_ext_arg(unsigned flags, const void __user *argp,
+ struct ext_arg *ext_arg)
{
struct io_uring_getevents_arg arg;
@@ -3123,8 +3227,8 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz
* is just a pointer to the sigset_t.
*/
if (!(flags & IORING_ENTER_EXT_ARG)) {
- *sig = (const sigset_t __user *) argp;
- *ts = NULL;
+ ext_arg->sig = (const sigset_t __user *) argp;
+ ext_arg->ts = NULL;
return 0;
}
@@ -3132,15 +3236,14 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz
* EXT_ARG is set - ensure we agree on the size of it and copy in our
* timespec and sigset_t pointers if good.
*/
- if (*argsz != sizeof(arg))
+ if (ext_arg->argsz != sizeof(arg))
return -EINVAL;
if (copy_from_user(&arg, argp, sizeof(arg)))
return -EFAULT;
- if (arg.pad)
- return -EINVAL;
- *sig = u64_to_user_ptr(arg.sigmask);
- *argsz = arg.sigmask_sz;
- *ts = u64_to_user_ptr(arg.ts);
+ ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC;
+ ext_arg->sig = u64_to_user_ptr(arg.sigmask);
+ ext_arg->argsz = arg.sigmask_sz;
+ ext_arg->ts = u64_to_user_ptr(arg.ts);
return 0;
}
@@ -3154,7 +3257,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
- IORING_ENTER_REGISTERED_RING)))
+ IORING_ENTER_REGISTERED_RING |
+ IORING_ENTER_ABS_TIMER)))
return -EINVAL;
/*
@@ -3245,15 +3349,14 @@ iopoll_locked:
}
mutex_unlock(&ctx->uring_lock);
} else {
- const sigset_t __user *sig;
- struct __kernel_timespec __user *ts;
+ struct ext_arg ext_arg = { .argsz = argsz };
- ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
+ ret2 = io_get_ext_arg(flags, argp, &ext_arg);
if (likely(!ret2)) {
min_complete = min(min_complete,
ctx->cq_entries);
- ret2 = io_cqring_wait(ctx, min_complete, sig,
- argsz, ts);
+ ret2 = io_cqring_wait(ctx, min_complete, flags,
+ &ext_arg);
}
}
@@ -3424,6 +3527,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
if (!ctx)
return -ENOMEM;
+ ctx->clockid = CLOCK_MONOTONIC;
+ ctx->clock_offset = 0;
+
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
!(ctx->flags & IORING_SETUP_IOPOLL) &&
!(ctx->flags & IORING_SETUP_SQPOLL))
@@ -3535,7 +3641,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
- IORING_FEAT_RECVSEND_BUNDLE;
+ IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index c2acf6180845..65078e641390 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -39,8 +39,12 @@ struct io_wait_queue {
struct wait_queue_entry wq;
struct io_ring_ctx *ctx;
unsigned cq_tail;
+ unsigned cq_min_tail;
unsigned nr_timeouts;
+ int hit_timeout;
+ ktime_t min_timeout;
ktime_t timeout;
+ struct hrtimer t;
#ifdef CONFIG_NET_RX_BUSY_POLL
ktime_t napi_busy_poll_dt;
@@ -437,6 +441,14 @@ static inline bool io_file_can_poll(struct io_kiocb *req)
return false;
}
+static inline ktime_t io_get_time(struct io_ring_ctx *ctx)
+{
+ if (ctx->clockid == CLOCK_MONOTONIC)
+ return ktime_get();
+
+ return ktime_get_with_offset(ctx->clock_offset);
+}
+
enum {
IO_CHECK_CQ_OVERFLOW_BIT,
IO_CHECK_CQ_DROPPED_BIT,
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index bdfa30b38321..d407576ddfb7 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -70,7 +70,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
return true;
}
-void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
+void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags)
{
/*
* We can add this buffer back to two lists:
@@ -88,12 +88,12 @@ void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
struct io_ring_ctx *ctx = req->ctx;
spin_lock(&ctx->completion_lock);
- __io_put_kbuf_list(req, &ctx->io_buffers_comp);
+ __io_put_kbuf_list(req, len, &ctx->io_buffers_comp);
spin_unlock(&ctx->completion_lock);
} else {
lockdep_assert_held(&req->ctx->uring_lock);
- __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
+ __io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache);
}
}
@@ -132,12 +132,6 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
return 1;
}
-static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br,
- __u16 head, __u16 mask)
-{
- return &br->bufs[head & mask];
-}
-
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
struct io_buffer_list *bl,
unsigned int issue_flags)
@@ -171,9 +165,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
* the transfer completes (or if we get -EAGAIN and must poll of
* retry).
*/
- req->flags &= ~REQ_F_BUFFERS_COMMIT;
+ io_kbuf_commit(req, bl, *len, 1);
req->buf_list = NULL;
- bl->head++;
}
return u64_to_user_ptr(buf->addr);
}
@@ -189,7 +182,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
bl = io_buffer_get_list(ctx, req->buf_index);
if (likely(bl)) {
- if (bl->is_buf_ring)
+ if (bl->flags & IOBL_BUF_RING)
ret = io_ring_buffer_select(req, len, bl, issue_flags);
else
ret = io_provided_buffer_select(req, len, bl);
@@ -219,14 +212,25 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
buf = io_ring_head_to_buf(br, head, bl->mask);
if (arg->max_len) {
u32 len = READ_ONCE(buf->len);
- size_t needed;
if (unlikely(!len))
return -ENOBUFS;
- needed = (arg->max_len + len - 1) / len;
- needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT);
- if (nr_avail > needed)
- nr_avail = needed;
+ /*
+ * Limit incremental buffers to 1 segment. No point trying
+ * to peek ahead and map more than we need, when the buffers
+ * themselves should be large when setup with
+ * IOU_PBUF_RING_INC.
+ */
+ if (bl->flags & IOBL_INC) {
+ nr_avail = 1;
+ } else {
+ size_t needed;
+
+ needed = (arg->max_len + len - 1) / len;
+ needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT);
+ if (nr_avail > needed)
+ nr_avail = needed;
+ }
}
/*
@@ -251,16 +255,21 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
req->buf_index = buf->bid;
do {
- /* truncate end piece, if needed */
- if (buf->len > arg->max_len)
- buf->len = arg->max_len;
+ u32 len = buf->len;
+
+ /* truncate end piece, if needed, for non partial buffers */
+ if (len > arg->max_len) {
+ len = arg->max_len;
+ if (!(bl->flags & IOBL_INC))
+ buf->len = len;
+ }
iov->iov_base = u64_to_user_ptr(buf->addr);
- iov->iov_len = buf->len;
+ iov->iov_len = len;
iov++;
- arg->out_len += buf->len;
- arg->max_len -= buf->len;
+ arg->out_len += len;
+ arg->max_len -= len;
if (!arg->max_len)
break;
@@ -287,7 +296,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
if (unlikely(!bl))
goto out_unlock;
- if (bl->is_buf_ring) {
+ if (bl->flags & IOBL_BUF_RING) {
ret = io_ring_buffers_peek(req, arg, bl);
/*
* Don't recycle these buffers if we need to go through poll.
@@ -297,8 +306,8 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
* committed them, they cannot be put back in the queue.
*/
if (ret > 0) {
- req->flags |= REQ_F_BL_NO_RECYCLE;
- req->buf_list->head += ret;
+ req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE;
+ io_kbuf_commit(req, bl, arg->out_len, ret);
}
} else {
ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs);
@@ -320,7 +329,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
if (unlikely(!bl))
return -ENOENT;
- if (bl->is_buf_ring) {
+ if (bl->flags & IOBL_BUF_RING) {
ret = io_ring_buffers_peek(req, arg, bl);
if (ret > 0)
req->flags |= REQ_F_BUFFERS_COMMIT;
@@ -340,22 +349,22 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (!nbufs)
return 0;
- if (bl->is_buf_ring) {
+ if (bl->flags & IOBL_BUF_RING) {
i = bl->buf_ring->tail - bl->head;
if (bl->buf_nr_pages) {
int j;
- if (!bl->is_mmap) {
+ if (!(bl->flags & IOBL_MMAP)) {
for (j = 0; j < bl->buf_nr_pages; j++)
unpin_user_page(bl->buf_pages[j]);
}
io_pages_unmap(bl->buf_ring, &bl->buf_pages,
- &bl->buf_nr_pages, bl->is_mmap);
- bl->is_mmap = 0;
+ &bl->buf_nr_pages, bl->flags & IOBL_MMAP);
+ bl->flags &= ~IOBL_MMAP;
}
/* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list);
- bl->is_buf_ring = 0;
+ bl->flags &= ~IOBL_BUF_RING;
return i;
}
@@ -442,7 +451,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
if (bl) {
ret = -EINVAL;
/* can't use provide/remove buffers command on mapped buffers */
- if (!bl->is_buf_ring)
+ if (!(bl->flags & IOBL_BUF_RING))
ret = __io_remove_buffers(ctx, bl, p->nbufs);
}
io_ring_submit_unlock(ctx, issue_flags);
@@ -589,7 +598,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
}
}
/* can't add buffers via this command for a mapped buffer ring */
- if (bl->is_buf_ring) {
+ if (bl->flags & IOBL_BUF_RING) {
ret = -EINVAL;
goto err;
}
@@ -641,8 +650,8 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
bl->buf_pages = pages;
bl->buf_nr_pages = nr_pages;
bl->buf_ring = br;
- bl->is_buf_ring = 1;
- bl->is_mmap = 0;
+ bl->flags |= IOBL_BUF_RING;
+ bl->flags &= ~IOBL_MMAP;
return 0;
error_unpin:
unpin_user_pages(pages, nr_pages);
@@ -665,8 +674,7 @@ static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
return -ENOMEM;
}
- bl->is_buf_ring = 1;
- bl->is_mmap = 1;
+ bl->flags |= (IOBL_BUF_RING | IOBL_MMAP);
return 0;
}
@@ -683,7 +691,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
- if (reg.flags & ~IOU_PBUF_RING_MMAP)
+ if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
return -EINVAL;
if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
if (!reg.ring_addr)
@@ -705,7 +713,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
bl = io_buffer_get_list(ctx, reg.bgid);
if (bl) {
/* if mapped buffer ring OR classic exists, don't allow */
- if (bl->is_buf_ring || !list_empty(&bl->buf_list))
+ if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list))
return -EEXIST;
} else {
free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
@@ -721,6 +729,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!ret) {
bl->nr_entries = reg.ring_entries;
bl->mask = reg.ring_entries - 1;
+ if (reg.flags & IOU_PBUF_RING_INC)
+ bl->flags |= IOBL_INC;
io_buffer_add_list(ctx, bl, reg.bgid);
return 0;
@@ -747,7 +757,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
bl = io_buffer_get_list(ctx, reg.bgid);
if (!bl)
return -ENOENT;
- if (!bl->is_buf_ring)
+ if (!(bl->flags & IOBL_BUF_RING))
return -EINVAL;
xa_erase(&ctx->io_bl_xa, bl->bgid);
@@ -771,7 +781,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
bl = io_buffer_get_list(ctx, buf_status.buf_group);
if (!bl)
return -ENOENT;
- if (!bl->is_buf_ring)
+ if (!(bl->flags & IOBL_BUF_RING))
return -EINVAL;
buf_status.head = bl->head;
@@ -802,7 +812,7 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
bl = xa_load(&ctx->io_bl_xa, bgid);
/* must be a mmap'able buffer ring and have pages */
ret = false;
- if (bl && bl->is_mmap)
+ if (bl && bl->flags & IOBL_MMAP)
ret = atomic_inc_not_zero(&bl->refs);
rcu_read_unlock();
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index b90aca3a57fa..36aadfe5ac00 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -4,6 +4,16 @@
#include <uapi/linux/io_uring.h>
+enum {
+ /* ring mapped provided buffers */
+ IOBL_BUF_RING = 1,
+ /* ring mapped provided buffers, but mmap'ed by application */
+ IOBL_MMAP = 2,
+ /* buffers are consumed incrementally rather than always fully */
+ IOBL_INC = 4,
+
+};
+
struct io_buffer_list {
/*
* If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
@@ -25,12 +35,9 @@ struct io_buffer_list {
__u16 head;
__u16 mask;
- atomic_t refs;
+ __u16 flags;
- /* ring mapped provided buffers */
- __u8 is_buf_ring;
- /* ring mapped provided buffers, but mmap'ed by application */
- __u8 is_mmap;
+ atomic_t refs;
};
struct io_buffer {
@@ -52,8 +59,8 @@ struct buf_sel_arg {
struct iovec *iovs;
size_t out_len;
size_t max_len;
- int nr_iovs;
- int mode;
+ unsigned short nr_iovs;
+ unsigned short mode;
};
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
@@ -73,7 +80,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
-void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
+void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
@@ -117,25 +124,55 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
return false;
}
-static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr)
+/* Mapped buffer ring, return io_uring_buf from head */
+#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)]
+
+static inline bool io_kbuf_commit(struct io_kiocb *req,
+ struct io_buffer_list *bl, int len, int nr)
+{
+ if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT)))
+ return true;
+
+ req->flags &= ~REQ_F_BUFFERS_COMMIT;
+
+ if (unlikely(len < 0))
+ return true;
+
+ if (bl->flags & IOBL_INC) {
+ struct io_uring_buf *buf;
+
+ buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask);
+ if (WARN_ON_ONCE(len > buf->len))
+ len = buf->len;
+ buf->len -= len;
+ if (buf->len) {
+ buf->addr += len;
+ return false;
+ }
+ }
+
+ bl->head += nr;
+ return true;
+}
+
+static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr)
{
struct io_buffer_list *bl = req->buf_list;
+ bool ret = true;
if (bl) {
- if (req->flags & REQ_F_BUFFERS_COMMIT) {
- bl->head += nr;
- req->flags &= ~REQ_F_BUFFERS_COMMIT;
- }
+ ret = io_kbuf_commit(req, bl, len, nr);
req->buf_index = bl->bgid;
}
req->flags &= ~REQ_F_BUFFER_RING;
+ return ret;
}
-static inline void __io_put_kbuf_list(struct io_kiocb *req,
+static inline void __io_put_kbuf_list(struct io_kiocb *req, int len,
struct list_head *list)
{
if (req->flags & REQ_F_BUFFER_RING) {
- __io_put_kbuf_ring(req, 1);
+ __io_put_kbuf_ring(req, len, 1);
} else {
req->buf_index = req->kbuf->bgid;
list_add(&req->kbuf->list, list);
@@ -150,11 +187,12 @@ static inline void io_kbuf_drop(struct io_kiocb *req)
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return;
- __io_put_kbuf_list(req, &req->ctx->io_buffers_comp);
+ /* len == 0 is fine here, non-ring will always drop all of it */
+ __io_put_kbuf_list(req, 0, &req->ctx->io_buffers_comp);
}
-static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs,
- unsigned issue_flags)
+static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len,
+ int nbufs, unsigned issue_flags)
{
unsigned int ret;
@@ -162,22 +200,24 @@ static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs,
return 0;
ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
- if (req->flags & REQ_F_BUFFER_RING)
- __io_put_kbuf_ring(req, nbufs);
- else
- __io_put_kbuf(req, issue_flags);
+ if (req->flags & REQ_F_BUFFER_RING) {
+ if (!__io_put_kbuf_ring(req, len, nbufs))
+ ret |= IORING_CQE_F_BUF_MORE;
+ } else {
+ __io_put_kbuf(req, len, issue_flags);
+ }
return ret;
}
-static inline unsigned int io_put_kbuf(struct io_kiocb *req,
+static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len,
unsigned issue_flags)
{
- return __io_put_kbufs(req, 1, issue_flags);
+ return __io_put_kbufs(req, len, 1, issue_flags);
}
-static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs,
- unsigned issue_flags)
+static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len,
+ int nbufs, unsigned issue_flags)
{
- return __io_put_kbufs(req, nbufs, issue_flags);
+ return __io_put_kbufs(req, len, nbufs, issue_flags);
}
#endif
diff --git a/io_uring/napi.c b/io_uring/napi.c
index 1de1d4d62925..d0cf694d0172 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -270,27 +270,6 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
}
/*
- * __io_napi_adjust_timeout() - adjust busy loop timeout
- * @ctx: pointer to io-uring context structure
- * @iowq: pointer to io wait queue
- * @ts: pointer to timespec or NULL
- *
- * Adjust the busy loop timeout according to timespec and busy poll timeout.
- * If the specified NAPI timeout is bigger than the wait timeout, then adjust
- * the NAPI timeout accordingly.
- */
-void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
- ktime_t to_wait)
-{
- ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
-
- if (to_wait)
- poll_dt = min(poll_dt, to_wait);
-
- iowq->napi_busy_poll_dt = poll_dt;
-}
-
-/*
* __io_napi_busy_loop() - execute busy poll loop
* @ctx: pointer to io-uring context structure
* @iowq: pointer to io wait queue
@@ -299,10 +278,18 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow
*/
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
{
- iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ return;
- if (!(ctx->flags & IORING_SETUP_SQPOLL))
- io_napi_blocking_busy_loop(ctx, iowq);
+ iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
+ if (iowq->timeout != KTIME_MAX) {
+ ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx));
+
+ iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt);
+ }
+
+ iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
+ io_napi_blocking_busy_loop(ctx, iowq);
}
/*
diff --git a/io_uring/napi.h b/io_uring/napi.h
index 27b88c3eb428..fd275ef0456d 100644
--- a/io_uring/napi.h
+++ b/io_uring/napi.h
@@ -17,8 +17,6 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg);
void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock);
-void __io_napi_adjust_timeout(struct io_ring_ctx *ctx,
- struct io_wait_queue *iowq, ktime_t to_wait);
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq);
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx);
@@ -27,15 +25,6 @@ static inline bool io_napi(struct io_ring_ctx *ctx)
return !list_empty(&ctx->napi_list);
}
-static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
- struct io_wait_queue *iowq,
- ktime_t to_wait)
-{
- if (!io_napi(ctx))
- return;
- __io_napi_adjust_timeout(ctx, iowq, to_wait);
-}
-
static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq)
{
@@ -86,11 +75,6 @@ static inline bool io_napi(struct io_ring_ctx *ctx)
static inline void io_napi_add(struct io_kiocb *req)
{
}
-static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
- struct io_wait_queue *iowq,
- ktime_t to_wait)
-{
-}
static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq)
{
diff --git a/io_uring/net.c b/io_uring/net.c
index d08abcca89cc..f10f5a22d66a 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -434,8 +434,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->buf_group = req->buf_index;
req->buf_list = NULL;
}
- if (req->flags & REQ_F_BUFFER_SELECT && sr->len)
- return -EINVAL;
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
@@ -499,11 +497,11 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret,
unsigned int cflags;
if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
- cflags = io_put_kbuf(req, issue_flags);
+ cflags = io_put_kbuf(req, *ret, issue_flags);
goto finish;
}
- cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags);
+ cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags);
if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
goto finish;
@@ -599,7 +597,7 @@ retry_bundle:
if (io_do_buffer_select(req)) {
struct buf_sel_arg arg = {
.iovs = &kmsg->fast_iov,
- .max_len = INT_MAX,
+ .max_len = min_not_zero(sr->len, INT_MAX),
.nr_iovs = 1,
};
@@ -618,14 +616,23 @@ retry_bundle:
if (unlikely(ret < 0))
return ret;
- sr->len = arg.out_len;
- iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret,
- arg.out_len);
if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
kmsg->free_iov_nr = ret;
kmsg->free_iov = arg.iovs;
req->flags |= REQ_F_NEED_CLEANUP;
}
+ sr->len = arg.out_len;
+
+ if (ret == 1) {
+ sr->buf = arg.iovs[0].iov_base;
+ ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
+ &kmsg->msg.msg_iter);
+ if (unlikely(ret))
+ return ret;
+ } else {
+ iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
+ arg.iovs, ret, arg.out_len);
+ }
}
/*
@@ -835,13 +842,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
if (sr->flags & IORING_RECVSEND_BUNDLE) {
- cflags |= io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret),
+ cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret),
issue_flags);
/* bundle with no more immediate buffers, we're done */
if (req->flags & REQ_F_BL_EMPTY)
goto finish;
} else {
- cflags |= io_put_kbuf(req, issue_flags);
+ cflags |= io_put_kbuf(req, *ret, issue_flags);
}
/*
diff --git a/io_uring/register.c b/io_uring/register.c
index e3c20be5a198..57cb85c42526 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -335,6 +335,31 @@ err:
return ret;
}
+static int io_register_clock(struct io_ring_ctx *ctx,
+ struct io_uring_clock_register __user *arg)
+{
+ struct io_uring_clock_register reg;
+
+ if (copy_from_user(&reg, arg, sizeof(reg)))
+ return -EFAULT;
+ if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
+ return -EINVAL;
+
+ switch (reg.clockid) {
+ case CLOCK_MONOTONIC:
+ ctx->clock_offset = 0;
+ break;
+ case CLOCK_BOOTTIME:
+ ctx->clock_offset = TK_OFFS_BOOT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ ctx->clockid = reg.clockid;
+ return 0;
+}
+
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -511,6 +536,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_unregister_napi(ctx, arg);
break;
+ case IORING_REGISTER_CLOCK:
+ ret = -EINVAL;
+ if (!arg || nr_args)
+ break;
+ ret = io_register_clock(ctx, arg);
+ break;
default:
ret = -EINVAL;
break;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 453867add7ca..e8639993b61e 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -855,6 +855,98 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
return ret;
}
+static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
+ struct io_imu_folio_data *data, int nr_folios)
+{
+ struct page **page_array = *pages, **new_array = NULL;
+ int nr_pages_left = *nr_pages, i, j;
+
+ /* Store head pages only*/
+ new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
+ GFP_KERNEL);
+ if (!new_array)
+ return false;
+
+ new_array[0] = compound_head(page_array[0]);
+ /*
+ * The pages are bound to the folio, it doesn't
+ * actually unpin them but drops all but one reference,
+ * which is usually put down by io_buffer_unmap().
+ * Note, needs a better helper.
+ */
+ if (data->nr_pages_head > 1)
+ unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
+
+ j = data->nr_pages_head;
+ nr_pages_left -= data->nr_pages_head;
+ for (i = 1; i < nr_folios; i++) {
+ unsigned int nr_unpin;
+
+ new_array[i] = page_array[j];
+ nr_unpin = min_t(unsigned int, nr_pages_left - 1,
+ data->nr_pages_mid - 1);
+ if (nr_unpin)
+ unpin_user_pages(&page_array[j+1], nr_unpin);
+ j += data->nr_pages_mid;
+ nr_pages_left -= data->nr_pages_mid;
+ }
+ kvfree(page_array);
+ *pages = new_array;
+ *nr_pages = nr_folios;
+ return true;
+}
+
+static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
+ struct io_imu_folio_data *data)
+{
+ struct page **page_array = *pages;
+ struct folio *folio = page_folio(page_array[0]);
+ unsigned int count = 1, nr_folios = 1;
+ int i;
+
+ if (*nr_pages <= 1)
+ return false;
+
+ data->nr_pages_mid = folio_nr_pages(folio);
+ if (data->nr_pages_mid == 1)
+ return false;
+
+ data->folio_shift = folio_shift(folio);
+ /*
+ * Check if pages are contiguous inside a folio, and all folios have
+ * the same page count except for the head and tail.
+ */
+ for (i = 1; i < *nr_pages; i++) {
+ if (page_folio(page_array[i]) == folio &&
+ page_array[i] == page_array[i-1] + 1) {
+ count++;
+ continue;
+ }
+
+ if (nr_folios == 1) {
+ if (folio_page_idx(folio, page_array[i-1]) !=
+ data->nr_pages_mid - 1)
+ return false;
+
+ data->nr_pages_head = count;
+ } else if (count != data->nr_pages_mid) {
+ return false;
+ }
+
+ folio = page_folio(page_array[i]);
+ if (folio_size(folio) != (1UL << data->folio_shift) ||
+ folio_page_idx(folio, page_array[i]) != 0)
+ return false;
+
+ count = 1;
+ nr_folios++;
+ }
+ if (nr_folios == 1)
+ data->nr_pages_head = count;
+
+ return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
+}
+
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
struct io_mapped_ubuf **pimu,
struct page **last_hpage)
@@ -864,7 +956,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
unsigned long off;
size_t size;
int ret, nr_pages, i;
- struct folio *folio = NULL;
+ struct io_imu_folio_data data;
+ bool coalesced;
*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
if (!iov->iov_base)
@@ -879,31 +972,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
goto done;
}
- /* If it's a huge page, try to coalesce them into a single bvec entry */
- if (nr_pages > 1) {
- folio = page_folio(pages[0]);
- for (i = 1; i < nr_pages; i++) {
- /*
- * Pages must be consecutive and on the same folio for
- * this to work
- */
- if (page_folio(pages[i]) != folio ||
- pages[i] != pages[i - 1] + 1) {
- folio = NULL;
- break;
- }
- }
- if (folio) {
- /*
- * The pages are bound to the folio, it doesn't
- * actually unpin them but drops all but one reference,
- * which is usually put down by io_buffer_unmap().
- * Note, needs a better helper.
- */
- unpin_user_pages(&pages[1], nr_pages - 1);
- nr_pages = 1;
- }
- }
+ /* If it's huge page(s), try to coalesce them into fewer bvec entries */
+ coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);
imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
if (!imu)
@@ -915,23 +985,25 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
goto done;
}
- off = (unsigned long) iov->iov_base & ~PAGE_MASK;
size = iov->iov_len;
/* store original address for later verification */
imu->ubuf = (unsigned long) iov->iov_base;
imu->ubuf_end = imu->ubuf + iov->iov_len;
imu->nr_bvecs = nr_pages;
+ imu->folio_shift = PAGE_SHIFT;
+ imu->folio_mask = PAGE_MASK;
+ if (coalesced) {
+ imu->folio_shift = data.folio_shift;
+ imu->folio_mask = ~((1UL << data.folio_shift) - 1);
+ }
+ off = (unsigned long) iov->iov_base & ~imu->folio_mask;
*pimu = imu;
ret = 0;
- if (folio) {
- bvec_set_page(&imu->bvec[0], pages[0], size, off);
- goto done;
- }
for (i = 0; i < nr_pages; i++) {
size_t vec_len;
- vec_len = min_t(size_t, size, PAGE_SIZE - off);
+ vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
off = 0;
size -= vec_len;
@@ -1042,23 +1114,18 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
* we know that:
*
* 1) it's a BVEC iter, we set it up
- * 2) all bvecs are PAGE_SIZE in size, except potentially the
+ * 2) all bvecs are the same in size, except potentially the
* first and last bvec
*
* So just find our index, and adjust the iterator afterwards.
* If the offset is within the first bvec (or the whole first
* bvec, just use iov_iter_advance(). This makes it easier
* since we can just skip the first segment, which may not
- * be PAGE_SIZE aligned.
+ * be folio_size aligned.
*/
const struct bio_vec *bvec = imu->bvec;
if (offset < bvec->bv_len) {
- /*
- * Note, huge pages buffers consists of one large
- * bvec entry and should always go this way. The other
- * branch doesn't expect non PAGE_SIZE'd chunks.
- */
iter->bvec = bvec;
iter->count -= offset;
iter->iov_offset = offset;
@@ -1067,12 +1134,12 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
/* skip first vec */
offset -= bvec->bv_len;
- seg_skip = 1 + (offset >> PAGE_SHIFT);
+ seg_skip = 1 + (offset >> imu->folio_shift);
iter->bvec = bvec + seg_skip;
iter->nr_segs -= seg_skip;
iter->count -= bvec->bv_len + offset;
- iter->iov_offset = offset & ~PAGE_MASK;
+ iter->iov_offset = offset & ~imu->folio_mask;
}
}
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index c032ca3436ca..3d0dda3556e6 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -22,8 +22,6 @@ struct io_rsrc_put {
};
};
-typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
-
struct io_rsrc_data {
struct io_ring_ctx *ctx;
@@ -46,10 +44,20 @@ struct io_mapped_ubuf {
u64 ubuf;
u64 ubuf_end;
unsigned int nr_bvecs;
+ unsigned int folio_shift;
unsigned long acct_pages;
+ unsigned long folio_mask;
struct bio_vec bvec[] __counted_by(nr_bvecs);
};
+struct io_imu_folio_data {
+ /* Head folio can be partially included in the fixed buf */
+ unsigned int nr_pages_head;
+ /* For non-head/tail folios, has to be fully included */
+ unsigned int nr_pages_mid;
+ unsigned int folio_shift;
+};
+
void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index c004d21e2f12..f023ff49c688 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -467,8 +467,7 @@ static void io_req_io_end(struct io_kiocb *req)
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
if (unlikely(res != req->cqe.res)) {
- if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
- io_rw_should_reissue(req)) {
+ if (res == -EAGAIN && io_rw_should_reissue(req)) {
/*
* Reissue will start accounting again, finish the
* current cycle.
@@ -511,7 +510,7 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
io_req_io_end(req);
if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
- req->cqe.flags |= io_put_kbuf(req, 0);
+ req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0);
io_req_rw_cleanup(req, 0);
io_req_task_complete(req, ts);
@@ -593,7 +592,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
*/
io_req_io_end(req);
io_req_set_res(req, final_ret,
- io_put_kbuf(req, issue_flags));
+ io_put_kbuf(req, ret, issue_flags));
io_req_rw_cleanup(req, issue_flags);
return IOU_OK;
}
@@ -855,6 +854,14 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
ret = io_iter_do_read(rw, &io->iter);
+ /*
+ * Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT
+ * issue, even though they should be returning -EAGAIN. To be safe,
+ * retry from blocking context for either.
+ */
+ if (ret == -EOPNOTSUPP && force_nonblock)
+ ret = -EAGAIN;
+
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
/* If we can poll, just do that. */
@@ -975,7 +982,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
* Put our buffer and post a CQE. If we fail to post a CQE, then
* jump to the termination path. This request is then done.
*/
- cflags = io_put_kbuf(req, issue_flags);
+ cflags = io_put_kbuf(req, ret, issue_flags);
rw->len = 0; /* similarly to above, reset len to 0 */
if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
@@ -1167,7 +1174,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
if (!smp_load_acquire(&req->iopoll_completed))
break;
nr_events++;
- req->cqe.flags = io_put_kbuf(req, 0);
+ req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0);
if (req->opcode != IORING_OP_URING_CMD)
io_req_rw_cleanup(req, 0);
}
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 3b50dc9586d1..272df9d00f45 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -10,6 +10,7 @@
#include <linux/slab.h>
#include <linux/audit.h>
#include <linux/security.h>
+#include <linux/cpuset.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
@@ -176,7 +177,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
- if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
+ if (to_submit || !wq_list_empty(&ctx->iopoll_list)) {
const struct cred *creds = NULL;
if (ctx->sq_creds != current_cred())
@@ -460,10 +461,12 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
return 0;
if (p->flags & IORING_SETUP_SQ_AFF) {
+ struct cpumask allowed_mask;
int cpu = p->sq_thread_cpu;
ret = -EINVAL;
- if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+ cpuset_cpus_allowed(current, &allowed_mask);
+ if (!cpumask_test_cpu(cpu, &allowed_mask))
goto err_sqpoll;
sqd->sq_cpu = cpu;
} else {