summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/async.c17
-rw-r--r--kernel/backtracetest.c18
-rw-r--r--kernel/bpf/cpumap.c2
-rw-r--r--kernel/bpf/helpers.c5
-rw-r--r--kernel/bpf/task_iter.c2
-rw-r--r--kernel/bpf/verifier.c5
-rw-r--r--kernel/cgroup/cpuset.c9
-rw-r--r--kernel/context_tracking.c4
-rw-r--r--kernel/cpu.c9
-rw-r--r--kernel/exit.c41
-rw-r--r--kernel/fork.c148
-rw-r--r--kernel/irq/irq_sim.c28
-rw-r--r--kernel/irq/irqdesc.c112
-rw-r--r--kernel/irq/irqdomain.c28
-rw-r--r--kernel/irq/manage.c109
-rw-r--r--kernel/irq/matrix.c28
-rw-r--r--kernel/irq/msi.c184
-rw-r--r--kernel/kprobes.c4
-rw-r--r--kernel/nsproxy.c2
-rw-r--r--kernel/pid.c57
-rw-r--r--kernel/power/swap.c28
-rw-r--r--kernel/rcu/Kconfig13
-rw-r--r--kernel/rcu/rcu.h19
-rw-r--r--kernel/rcu/rcuscale.c6
-rw-r--r--kernel/rcu/rcutorture.c13
-rw-r--r--kernel/rcu/srcutree.c24
-rw-r--r--kernel/rcu/sync.c16
-rw-r--r--kernel/rcu/tasks.h135
-rw-r--r--kernel/rcu/tiny.c1
-rw-r--r--kernel/rcu/tree.c237
-rw-r--r--kernel/rcu/tree.h20
-rw-r--r--kernel/rcu/tree_exp.h83
-rw-r--r--kernel/rcu/tree_nocb.h69
-rw-r--r--kernel/rcu/tree_plugin.h52
-rw-r--r--kernel/sched/core.c17
-rw-r--r--kernel/sched/membarrier.c6
-rw-r--r--kernel/signal.c110
-rw-r--r--kernel/softirq.c5
-rw-r--r--kernel/sys.c50
-rw-r--r--kernel/time/hrtimer.c3
-rw-r--r--kernel/time/time_test.c2
-rw-r--r--kernel/trace/fprobe.c14
-rw-r--r--kernel/trace/ftrace.c10
-rw-r--r--kernel/trace/ring_buffer.c173
-rw-r--r--kernel/trace/trace.c111
-rw-r--r--kernel/trace/trace_btf.c4
-rw-r--r--kernel/trace/trace_events_synth.c3
-rw-r--r--kernel/trace/trace_output.c6
-rw-r--r--kernel/trace/trace_probe.c32
-rw-r--r--kernel/trace/trace_probe.h3
-rw-r--r--kernel/workqueue.c1849
51 files changed, 2579 insertions, 1347 deletions
diff --git a/kernel/async.c b/kernel/async.c
index 97f224a5257b..4c3e6a44595f 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -64,6 +64,7 @@ static async_cookie_t next_cookie = 1;
static LIST_HEAD(async_global_pending); /* pending from all registered doms */
static ASYNC_DOMAIN(async_dfl_domain);
static DEFINE_SPINLOCK(async_lock);
+static struct workqueue_struct *async_wq;
struct async_entry {
struct list_head domain_list;
@@ -174,7 +175,7 @@ static async_cookie_t __async_schedule_node_domain(async_func_t func,
spin_unlock_irqrestore(&async_lock, flags);
/* schedule for execution */
- queue_work_node(node, system_unbound_wq, &entry->work);
+ queue_work_node(node, async_wq, &entry->work);
return newcookie;
}
@@ -345,3 +346,17 @@ bool current_is_async(void)
return worker && worker->current_func == async_run_entry_fn;
}
EXPORT_SYMBOL_GPL(current_is_async);
+
+void __init async_init(void)
+{
+ /*
+ * Async can schedule a number of interdependent work items. However,
+ * unbound workqueues can handle only upto min_active interdependent
+ * work items. The default min_active of 8 isn't sufficient for async
+ * and can lead to stalls. Let's use a dedicated workqueue with raised
+ * min_active.
+ */
+ async_wq = alloc_workqueue("async", WQ_UNBOUND, 0);
+ BUG_ON(!async_wq);
+ workqueue_set_min_active(async_wq, WQ_DFL_ACTIVE);
+}
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index 370217dd7e39..a4181234232b 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -21,24 +21,20 @@ static void backtrace_test_normal(void)
dump_stack();
}
-static DECLARE_COMPLETION(backtrace_work);
-
-static void backtrace_test_irq_callback(unsigned long data)
+static void backtrace_test_bh_workfn(struct work_struct *work)
{
dump_stack();
- complete(&backtrace_work);
}
-static DECLARE_TASKLET_OLD(backtrace_tasklet, &backtrace_test_irq_callback);
+static DECLARE_WORK(backtrace_bh_work, &backtrace_test_bh_workfn);
-static void backtrace_test_irq(void)
+static void backtrace_test_bh(void)
{
- pr_info("Testing a backtrace from irq context.\n");
+ pr_info("Testing a backtrace from BH context.\n");
pr_info("The following trace is a kernel self test and not a bug!\n");
- init_completion(&backtrace_work);
- tasklet_schedule(&backtrace_tasklet);
- wait_for_completion(&backtrace_work);
+ queue_work(system_bh_wq, &backtrace_bh_work);
+ flush_work(&backtrace_bh_work);
}
#ifdef CONFIG_STACKTRACE
@@ -65,7 +61,7 @@ static int backtrace_regression_test(void)
pr_info("====[ backtrace testing ]===========\n");
backtrace_test_normal();
- backtrace_test_irq();
+ backtrace_test_bh();
backtrace_test_saved();
pr_info("====[ end of backtrace testing ]====\n");
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8a0bb80fe48a..ef82ffc90cbe 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -178,7 +178,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
void **frames, int n,
struct xdp_cpumap_stats *stats)
{
- struct xdp_rxq_info rxq;
+ struct xdp_rxq_info rxq = {};
struct xdp_buff xdp;
int i, nframes = 0;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index be72824f32b2..d19cd863d294 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1101,6 +1101,7 @@ struct bpf_hrtimer {
struct bpf_prog *prog;
void __rcu *callback_fn;
void *value;
+ struct rcu_head rcu;
};
/* the actual struct hidden inside uapi struct bpf_timer */
@@ -1332,6 +1333,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
if (in_nmi())
return -EOPNOTSUPP;
+ rcu_read_lock();
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
if (!t) {
@@ -1353,6 +1355,7 @@ out:
* if it was running.
*/
ret = ret ?: hrtimer_cancel(&t->timer);
+ rcu_read_unlock();
return ret;
}
@@ -1407,7 +1410,7 @@ out:
*/
if (this_cpu_read(hrtimer_running) != t)
hrtimer_cancel(&t->timer);
- kfree(t);
+ kfree_rcu(t, rcu);
}
BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index e5c3500443c6..ec4e97c61eef 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -978,6 +978,8 @@ __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
__alignof__(struct bpf_iter_task));
+ kit->pos = NULL;
+
switch (flags) {
case BPF_TASK_ITER_ALL_THREADS:
case BPF_TASK_ITER_ALL_PROCS:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 65f598694d55..ddea9567f755 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5227,7 +5227,9 @@ BTF_ID(struct, prog_test_ref_kfunc)
#ifdef CONFIG_CGROUPS
BTF_ID(struct, cgroup)
#endif
+#ifdef CONFIG_BPF_JIT
BTF_ID(struct, bpf_cpumask)
+#endif
BTF_ID(struct, task_struct)
BTF_SET_END(rcu_protected_types)
@@ -16600,6 +16602,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
{
int i;
+ if (old->callback_depth > cur->callback_depth)
+ return false;
+
for (i = 0; i < MAX_BPF_REG; i++)
if (!regsafe(env, &old->regs[i], &cur->regs[i],
&env->idmap_scratch, exact))
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ba36c073304a..4237c8748715 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2562,7 +2562,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
update_partition_sd_lb(cs, old_prs);
out_free:
free_cpumasks(NULL, &tmp);
- return 0;
+ return retval;
}
/**
@@ -2598,9 +2598,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
return 0;
- if (alloc_cpumasks(NULL, &tmp))
- return -ENOMEM;
-
if (*buf)
compute_effective_exclusive_cpumask(trialcs, NULL);
@@ -2615,6 +2612,9 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval)
return retval;
+ if (alloc_cpumasks(NULL, &tmp))
+ return -ENOMEM;
+
if (old_prs) {
if (cpumask_empty(trialcs->effective_xcpus)) {
invalidate = true;
@@ -3897,6 +3897,7 @@ static struct cftype legacy_files[] = {
},
{
+ /* obsolete, may be removed in the future */
.name = "memory_spread_slab",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 6ef0b35fc28c..70ae70d03823 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -458,6 +458,8 @@ static __always_inline void context_tracking_recursion_exit(void)
* __ct_user_enter - Inform the context tracking that the CPU is going
* to enter user or guest space mode.
*
+ * @state: userspace context-tracking state to enter.
+ *
* This function must be called right before we switch from the kernel
* to user or guest space, when it's guaranteed the remaining kernel
* instructions to execute won't use any RCU read side critical section
@@ -595,6 +597,8 @@ NOKPROBE_SYMBOL(user_enter_callable);
* __ct_user_exit - Inform the context tracking that the CPU is
* exiting user or guest mode and entering the kernel.
*
+ * @state: userspace context-tracking state being exited from.
+ *
* This function must be called after we entered the kernel from user or
* guest space before any use of RCU read side critical section. This
* potentially include any high level kernel code like syscalls, exceptions,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e6ec3ba4950b..cc4a8068747c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -54,7 +54,6 @@
* @rollback: Perform a rollback
* @single: Single callback invocation
* @bringup: Single callback bringup or teardown selector
- * @cpu: CPU number
* @node: Remote CPU node; for multi-instance, do a
* single entry callback for install/remove
* @last: For multi-instance rollback, remember how far we got
@@ -3005,7 +3004,7 @@ static ssize_t control_show(struct device *dev,
return sysfs_emit(buf, "%d\n", cpu_smt_num_threads);
#endif
- return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
+ return sysfs_emit(buf, "%s\n", state);
}
static ssize_t control_store(struct device *dev, struct device_attribute *attr,
@@ -3018,7 +3017,7 @@ static DEVICE_ATTR_RW(control);
static ssize_t active_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
+ return sysfs_emit(buf, "%d\n", sched_smt_active());
}
static DEVICE_ATTR_RO(active);
@@ -3107,10 +3106,10 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);
#ifdef CONFIG_INIT_ALL_POSSIBLE
-struct cpumask __cpu_possible_mask __read_mostly
+struct cpumask __cpu_possible_mask __ro_after_init
= {CPU_BITS_ALL};
#else
-struct cpumask __cpu_possible_mask __read_mostly;
+struct cpumask __cpu_possible_mask __ro_after_init;
#endif
EXPORT_SYMBOL(__cpu_possible_mask);
diff --git a/kernel/exit.c b/kernel/exit.c
index 3988a02efaef..41a12630cbbc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
tsk->exit_state = EXIT_ZOMBIE;
+ /*
+ * sub-thread or delay_group_leader(), wake up the
+ * PIDFD_THREAD waiters.
+ */
+ if (!thread_group_empty(tsk))
+ do_notify_pidfd(tsk);
+
if (unlikely(tsk->ptrace)) {
int sig = thread_group_leader(tsk) &&
thread_group_empty(tsk) &&
@@ -1127,17 +1134,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* and nobody can change them.
*
* psig->stats_lock also protects us from our sub-threads
- * which can reap other children at the same time. Until
- * we change k_getrusage()-like users to rely on this lock
- * we have to take ->siglock as well.
+ * which can reap other children at the same time.
*
* We use thread_group_cputime_adjusted() to get times for
* the thread group, which consolidates times for all threads
* in the group including the group leader.
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- spin_lock_irq(&current->sighand->siglock);
- write_seqlock(&psig->stats_lock);
+ write_seqlock_irq(&psig->stats_lock);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1160,8 +1164,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
- write_sequnlock(&psig->stats_lock);
- spin_unlock_irq(&current->sighand->siglock);
+ write_sequnlock_irq(&psig->stats_lock);
}
if (wo->wo_rusage)
@@ -1893,30 +1896,6 @@ Efault:
}
#endif
-/**
- * thread_group_exited - check that a thread group has exited
- * @pid: tgid of thread group to be checked.
- *
- * Test if the thread group represented by tgid has exited (all
- * threads are zombies, dead or completely gone).
- *
- * Return: true if the thread group has exited. false otherwise.
- */
-bool thread_group_exited(struct pid *pid)
-{
- struct task_struct *task;
- bool exited;
-
- rcu_read_lock();
- task = pid_task(pid, PIDTYPE_PID);
- exited = !task ||
- (READ_ONCE(task->exit_state) && thread_group_empty(task));
- rcu_read_unlock();
-
- return exited;
-}
-EXPORT_SYMBOL(thread_group_exited);
-
/*
* This needs to be __function_aligned as GCC implicitly makes any
* implementation of abort() cold and drops alignment specified by
diff --git a/kernel/fork.c b/kernel/fork.c
index 0d944e92a43f..39a5046c2f0b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -101,6 +101,8 @@
#include <linux/user_events.h>
#include <linux/iommu.h>
#include <linux/rseq.h>
+#include <uapi/linux/pidfd.h>
+#include <linux/pidfs.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -1976,6 +1978,7 @@ static inline void rcu_copy_process(struct task_struct *p)
p->rcu_tasks_holdout = false;
INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
p->rcu_tasks_idle_cpu = -1;
+ INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
p->trc_reader_nesting = 0;
@@ -1985,119 +1988,6 @@ static inline void rcu_copy_process(struct task_struct *p)
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
-struct pid *pidfd_pid(const struct file *file)
-{
- if (file->f_op == &pidfd_fops)
- return file->private_data;
-
- return ERR_PTR(-EBADF);
-}
-
-static int pidfd_release(struct inode *inode, struct file *file)
-{
- struct pid *pid = file->private_data;
-
- file->private_data = NULL;
- put_pid(pid);
- return 0;
-}
-
-#ifdef CONFIG_PROC_FS
-/**
- * pidfd_show_fdinfo - print information about a pidfd
- * @m: proc fdinfo file
- * @f: file referencing a pidfd
- *
- * Pid:
- * This function will print the pid that a given pidfd refers to in the
- * pid namespace of the procfs instance.
- * If the pid namespace of the process is not a descendant of the pid
- * namespace of the procfs instance 0 will be shown as its pid. This is
- * similar to calling getppid() on a process whose parent is outside of
- * its pid namespace.
- *
- * NSpid:
- * If pid namespaces are supported then this function will also print
- * the pid of a given pidfd refers to for all descendant pid namespaces
- * starting from the current pid namespace of the instance, i.e. the
- * Pid field and the first entry in the NSpid field will be identical.
- * If the pid namespace of the process is not a descendant of the pid
- * namespace of the procfs instance 0 will be shown as its first NSpid
- * entry and no others will be shown.
- * Note that this differs from the Pid and NSpid fields in
- * /proc/<pid>/status where Pid and NSpid are always shown relative to
- * the pid namespace of the procfs instance. The difference becomes
- * obvious when sending around a pidfd between pid namespaces from a
- * different branch of the tree, i.e. where no ancestral relation is
- * present between the pid namespaces:
- * - create two new pid namespaces ns1 and ns2 in the initial pid
- * namespace (also take care to create new mount namespaces in the
- * new pid namespace and mount procfs)
- * - create a process with a pidfd in ns1
- * - send pidfd from ns1 to ns2
- * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
- * have exactly one entry, which is 0
- */
-static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
-{
- struct pid *pid = f->private_data;
- struct pid_namespace *ns;
- pid_t nr = -1;
-
- if (likely(pid_has_task(pid, PIDTYPE_PID))) {
- ns = proc_pid_ns(file_inode(m->file)->i_sb);
- nr = pid_nr_ns(pid, ns);
- }
-
- seq_put_decimal_ll(m, "Pid:\t", nr);
-
-#ifdef CONFIG_PID_NS
- seq_put_decimal_ll(m, "\nNSpid:\t", nr);
- if (nr > 0) {
- int i;
-
- /* If nr is non-zero it means that 'pid' is valid and that
- * ns, i.e. the pid namespace associated with the procfs
- * instance, is in the pid namespace hierarchy of pid.
- * Start at one below the already printed level.
- */
- for (i = ns->level + 1; i <= pid->level; i++)
- seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
- }
-#endif
- seq_putc(m, '\n');
-}
-#endif
-
-/*
- * Poll support for process exit notification.
- */
-static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
-{
- struct pid *pid = file->private_data;
- __poll_t poll_flags = 0;
-
- poll_wait(file, &pid->wait_pidfd, pts);
-
- /*
- * Inform pollers only when the whole thread group exits.
- * If the thread group leader exits before all other threads in the
- * group, then poll(2) should block, similar to the wait(2) family.
- */
- if (thread_group_exited(pid))
- poll_flags = EPOLLIN | EPOLLRDNORM;
-
- return poll_flags;
-}
-
-const struct file_operations pidfd_fops = {
- .release = pidfd_release,
- .poll = pidfd_poll,
-#ifdef CONFIG_PROC_FS
- .show_fdinfo = pidfd_show_fdinfo,
-#endif
-};
-
/**
* __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
* @pid: the struct pid for which to create a pidfd
@@ -2131,20 +2021,20 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
int pidfd;
struct file *pidfd_file;
- if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
- return -EINVAL;
-
- pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ pidfd = get_unused_fd_flags(O_CLOEXEC);
if (pidfd < 0)
return pidfd;
- pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
- flags | O_RDWR | O_CLOEXEC);
+ pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
if (IS_ERR(pidfd_file)) {
put_unused_fd(pidfd);
return PTR_ERR(pidfd_file);
}
- get_pid(pid); /* held by pidfd_file now */
+ /*
+ * anon_inode_getfile() ignores everything outside of the
+ * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
+ */
+ pidfd_file->f_flags |= (flags & PIDFD_THREAD);
*ret = pidfd_file;
return pidfd;
}
@@ -2158,7 +2048,8 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
* caller's file descriptor table. The pidfd is reserved but not installed yet.
*
- * The helper verifies that @pid is used as a thread group leader.
+ * The helper verifies that @pid is still in use, without PIDFD_THREAD the
+ * task identified by @pid must be a thread-group leader.
*
* If this function returns successfully the caller is responsible to either
* call fd_install() passing the returned pidfd and pidfd file as arguments in
@@ -2177,7 +2068,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
*/
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
- if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+ bool thread = flags & PIDFD_THREAD;
+
+ if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
return -EINVAL;
return __pidfd_prepare(pid, flags, ret);
@@ -2299,9 +2192,8 @@ __latent_entropy struct task_struct *copy_process(
/*
* - CLONE_DETACHED is blocked so that we can potentially
* reuse it later for CLONE_PIDFD.
- * - CLONE_THREAD is blocked until someone really needs it.
*/
- if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
+ if (clone_flags & CLONE_DETACHED)
return ERR_PTR(-EINVAL);
}
@@ -2524,8 +2416,10 @@ __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
+ int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
+
/* Note that no task has been attached to @pid yet. */
- retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
+ retval = __pidfd_prepare(pid, flags, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
@@ -2876,8 +2770,8 @@ pid_t kernel_clone(struct kernel_clone_args *args)
* here has the advantage that we don't need to have a separate helper
* to check for legacy clone().
*/
- if ((args->flags & CLONE_PIDFD) &&
- (args->flags & CLONE_PARENT_SETTID) &&
+ if ((clone_flags & CLONE_PIDFD) &&
+ (clone_flags & CLONE_PARENT_SETTID) &&
(args->pidfd == args->parent_tid))
return -EINVAL;
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index dd76323ea3fd..38d6ae651ac7 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -4,10 +4,11 @@
* Copyright (C) 2020 Bartosz Golaszewski <bgolaszewski@baylibre.com>
*/
+#include <linux/cleanup.h>
+#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/irq_sim.h>
#include <linux/irq_work.h>
-#include <linux/interrupt.h>
#include <linux/slab.h>
struct irq_sim_work_ctx {
@@ -19,7 +20,6 @@ struct irq_sim_work_ctx {
};
struct irq_sim_irq_ctx {
- int irqnum;
bool enabled;
struct irq_sim_work_ctx *work_ctx;
};
@@ -164,33 +164,27 @@ static const struct irq_domain_ops irq_sim_domain_ops = {
struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode,
unsigned int num_irqs)
{
- struct irq_sim_work_ctx *work_ctx;
+ struct irq_sim_work_ctx *work_ctx __free(kfree) =
+ kmalloc(sizeof(*work_ctx), GFP_KERNEL);
- work_ctx = kmalloc(sizeof(*work_ctx), GFP_KERNEL);
if (!work_ctx)
- goto err_out;
+ return ERR_PTR(-ENOMEM);
- work_ctx->pending = bitmap_zalloc(num_irqs, GFP_KERNEL);
- if (!work_ctx->pending)
- goto err_free_work_ctx;
+ unsigned long *pending __free(bitmap) = bitmap_zalloc(num_irqs, GFP_KERNEL);
+ if (!pending)
+ return ERR_PTR(-ENOMEM);
work_ctx->domain = irq_domain_create_linear(fwnode, num_irqs,
&irq_sim_domain_ops,
work_ctx);
if (!work_ctx->domain)
- goto err_free_bitmap;
+ return ERR_PTR(-ENOMEM);
work_ctx->irq_count = num_irqs;
work_ctx->work = IRQ_WORK_INIT_HARD(irq_sim_handle_irq);
+ work_ctx->pending = no_free_ptr(pending);
- return work_ctx->domain;
-
-err_free_bitmap:
- bitmap_free(work_ctx->pending);
-err_free_work_ctx:
- kfree(work_ctx);
-err_out:
- return ERR_PTR(-ENOMEM);
+ return no_free_ptr(work_ctx)->domain;
}
EXPORT_SYMBOL_GPL(irq_domain_create_sim);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 371eb1711d34..4c6b32318ce3 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -92,11 +92,23 @@ static void desc_smp_init(struct irq_desc *desc, int node,
#endif
}
+static void free_masks(struct irq_desc *desc)
+{
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ free_cpumask_var(desc->pending_mask);
+#endif
+ free_cpumask_var(desc->irq_common_data.affinity);
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ free_cpumask_var(desc->irq_common_data.effective_affinity);
+#endif
+}
+
#else
static inline int
alloc_masks(struct irq_desc *desc, int node) { return 0; }
static inline void
desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { }
+static inline void free_masks(struct irq_desc *desc) { }
#endif
static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
@@ -166,6 +178,39 @@ static void delete_irq_desc(unsigned int irq)
}
#ifdef CONFIG_SPARSE_IRQ
+static const struct kobj_type irq_kobj_type;
+#endif
+
+static int init_desc(struct irq_desc *desc, int irq, int node,
+ unsigned int flags,
+ const struct cpumask *affinity,
+ struct module *owner)
+{
+ desc->kstat_irqs = alloc_percpu(unsigned int);
+ if (!desc->kstat_irqs)
+ return -ENOMEM;
+
+ if (alloc_masks(desc, node)) {
+ free_percpu(desc->kstat_irqs);
+ return -ENOMEM;
+ }
+
+ raw_spin_lock_init(&desc->lock);
+ lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+ mutex_init(&desc->request_mutex);
+ init_waitqueue_head(&desc->wait_for_threads);
+ desc_set_defaults(irq, desc, node, affinity, owner);
+ irqd_set(&desc->irq_data, flags);
+ irq_resend_init(desc);
+#ifdef CONFIG_SPARSE_IRQ
+ kobject_init(&desc->kobj, &irq_kobj_type);
+ init_rcu_head(&desc->rcu);
+#endif
+
+ return 0;
+}
+
+#ifdef CONFIG_SPARSE_IRQ
static void irq_kobj_release(struct kobject *kobj);
@@ -384,21 +429,6 @@ struct irq_desc *irq_to_desc(unsigned int irq)
EXPORT_SYMBOL_GPL(irq_to_desc);
#endif
-#ifdef CONFIG_SMP
-static void free_masks(struct irq_desc *desc)
-{
-#ifdef CONFIG_GENERIC_PENDING_IRQ
- free_cpumask_var(desc->pending_mask);
-#endif
- free_cpumask_var(desc->irq_common_data.affinity);
-#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
- free_cpumask_var(desc->irq_common_data.effective_affinity);
-#endif
-}
-#else
-static inline void free_masks(struct irq_desc *desc) { }
-#endif
-
void irq_lock_sparse(void)
{
mutex_lock(&sparse_irq_lock);
@@ -414,36 +444,19 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
struct module *owner)
{
struct irq_desc *desc;
+ int ret;
desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
if (!desc)
return NULL;
- /* allocate based on nr_cpu_ids */
- desc->kstat_irqs = alloc_percpu(unsigned int);
- if (!desc->kstat_irqs)
- goto err_desc;
-
- if (alloc_masks(desc, node))
- goto err_kstat;
- raw_spin_lock_init(&desc->lock);
- lockdep_set_class(&desc->lock, &irq_desc_lock_class);
- mutex_init(&desc->request_mutex);
- init_rcu_head(&desc->rcu);
- init_waitqueue_head(&desc->wait_for_threads);
-
- desc_set_defaults(irq, desc, node, affinity, owner);
- irqd_set(&desc->irq_data, flags);
- kobject_init(&desc->kobj, &irq_kobj_type);
- irq_resend_init(desc);
+ ret = init_desc(desc, irq, node, flags, affinity, owner);
+ if (unlikely(ret)) {
+ kfree(desc);
+ return NULL;
+ }
return desc;
-
-err_kstat:
- free_percpu(desc->kstat_irqs);
-err_desc:
- kfree(desc);
- return NULL;
}
static void irq_kobj_release(struct kobject *kobj)
@@ -583,26 +596,29 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
int __init early_irq_init(void)
{
int count, i, node = first_online_node;
- struct irq_desc *desc;
+ int ret;
init_irq_default_affinity();
printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS);
- desc = irq_desc;
count = ARRAY_SIZE(irq_desc);
for (i = 0; i < count; i++) {
- desc[i].kstat_irqs = alloc_percpu(unsigned int);
- alloc_masks(&desc[i], node);
- raw_spin_lock_init(&desc[i].lock);
- lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
- mutex_init(&desc[i].request_mutex);
- init_waitqueue_head(&desc[i].wait_for_threads);
- desc_set_defaults(i, &desc[i], node, NULL, NULL);
- irq_resend_init(&desc[i]);
+ ret = init_desc(irq_desc + i, i, node, 0, NULL, NULL);
+ if (unlikely(ret))
+ goto __free_desc_res;
}
+
return arch_early_irq_init();
+
+__free_desc_res:
+ while (--i >= 0) {
+ free_masks(irq_desc + i);
+ free_percpu(irq_desc[i].kstat_irqs);
+ }
+
+ return ret;
}
struct irq_desc *irq_to_desc(unsigned int irq)
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 0bdef4fe925b..3dd1c871e091 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -29,6 +29,7 @@ static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
unsigned int nr_irqs, int node, void *arg,
bool realloc, const struct irq_affinity_desc *affinity);
static void irq_domain_check_hierarchy(struct irq_domain *domain);
+static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq);
struct irqchip_fwid {
struct fwnode_handle fwnode;
@@ -448,7 +449,7 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
*/
mutex_lock(&irq_domain_mutex);
list_for_each_entry(h, &irq_domain_list, link) {
- if (h->ops->select && fwspec->param_count)
+ if (h->ops->select && bus_token != DOMAIN_BUS_ANY)
rc = h->ops->select(h, fwspec, bus_token);
else if (h->ops->match)
rc = h->ops->match(h, to_of_node(fwnode), bus_token);
@@ -858,8 +859,13 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
}
if (irq_domain_is_hierarchy(domain)) {
- virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE,
- fwspec, false, NULL);
+ if (irq_domain_is_msi_device(domain)) {
+ mutex_unlock(&domain->root->mutex);
+ virq = msi_device_domain_alloc_wired(domain, hwirq, type);
+ mutex_lock(&domain->root->mutex);
+ } else
+ virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE,
+ fwspec, false, NULL);
if (virq <= 0) {
virq = 0;
goto out;
@@ -914,7 +920,7 @@ void irq_dispose_mapping(unsigned int virq)
return;
if (irq_domain_is_hierarchy(domain)) {
- irq_domain_free_irqs(virq, 1);
+ irq_domain_free_one_irq(domain, virq);
} else {
irq_domain_disassociate(domain, virq);
irq_free_desc(virq);
@@ -1755,6 +1761,14 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
irq_free_descs(virq, nr_irqs);
}
+static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq)
+{
+ if (irq_domain_is_msi_device(domain))
+ msi_device_domain_free_wired(domain, virq);
+ else
+ irq_domain_free_irqs(virq, 1);
+}
+
/**
* irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain
* @domain: Domain below which interrupts must be allocated
@@ -1907,9 +1921,9 @@ static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
return -EINVAL;
}
-static void irq_domain_check_hierarchy(struct irq_domain *domain)
-{
-}
+static void irq_domain_check_hierarchy(struct irq_domain *domain) { }
+static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq) { }
+
#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1782f90cd8c6..ad3eaf2ab959 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -192,10 +192,14 @@ void irq_set_thread_affinity(struct irq_desc *desc)
struct irqaction *action;
for_each_action_of_desc(desc, action) {
- if (action->thread)
+ if (action->thread) {
set_bit(IRQTF_AFFINITY, &action->thread_flags);
- if (action->secondary && action->secondary->thread)
+ wake_up_process(action->thread);
+ }
+ if (action->secondary && action->secondary->thread) {
set_bit(IRQTF_AFFINITY, &action->secondary->thread_flags);
+ wake_up_process(action->secondary->thread);
+ }
}
}
@@ -1049,10 +1053,57 @@ static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id)
return IRQ_NONE;
}
-static int irq_wait_for_interrupt(struct irqaction *action)
+#ifdef CONFIG_SMP
+/*
+ * Check whether we need to change the affinity of the interrupt thread.
+ */
+static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
+{
+ cpumask_var_t mask;
+ bool valid = false;
+
+ if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
+ return;
+
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * In case we are out of memory we set IRQTF_AFFINITY again and
+ * try again next time
+ */
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
+ set_bit(IRQTF_AFFINITY, &action->thread_flags);
+ return;
+ }
+
+ raw_spin_lock_irq(&desc->lock);
+ /*
+ * This code is triggered unconditionally. Check the affinity
+ * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
+ */
+ if (cpumask_available(desc->irq_common_data.affinity)) {
+ const struct cpumask *m;
+
+ m = irq_data_get_effective_affinity_mask(&desc->irq_data);
+ cpumask_copy(mask, m);
+ valid = true;
+ }
+ raw_spin_unlock_irq(&desc->lock);
+
+ if (valid)
+ set_cpus_allowed_ptr(current, mask);
+ free_cpumask_var(mask);
+}
+#else
+static inline void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
+#endif
+
+static int irq_wait_for_interrupt(struct irq_desc *desc,
+ struct irqaction *action)
{
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
+ irq_thread_check_affinity(desc, action);
if (kthread_should_stop()) {
/* may need to run one last time */
@@ -1129,52 +1180,6 @@ out_unlock:
chip_bus_sync_unlock(desc);
}
-#ifdef CONFIG_SMP
-/*
- * Check whether we need to change the affinity of the interrupt thread.
- */
-static void
-irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
-{
- cpumask_var_t mask;
- bool valid = true;
-
- if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
- return;
-
- /*
- * In case we are out of memory we set IRQTF_AFFINITY again and
- * try again next time
- */
- if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
- set_bit(IRQTF_AFFINITY, &action->thread_flags);
- return;
- }
-
- raw_spin_lock_irq(&desc->lock);
- /*
- * This code is triggered unconditionally. Check the affinity
- * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
- */
- if (cpumask_available(desc->irq_common_data.affinity)) {
- const struct cpumask *m;
-
- m = irq_data_get_effective_affinity_mask(&desc->irq_data);
- cpumask_copy(mask, m);
- } else {
- valid = false;
- }
- raw_spin_unlock_irq(&desc->lock);
-
- if (valid)
- set_cpus_allowed_ptr(current, mask);
- free_cpumask_var(mask);
-}
-#else
-static inline void
-irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
-#endif
-
/*
* Interrupts which are not explicitly requested as threaded
* interrupts rely on the implicit bh/preempt disable of the hard irq
@@ -1312,13 +1317,9 @@ static int irq_thread(void *data)
init_task_work(&on_exit_work, irq_thread_dtor);
task_work_add(current, &on_exit_work, TWA_NONE);
- irq_thread_check_affinity(desc, action);
-
- while (!irq_wait_for_interrupt(action)) {
+ while (!irq_wait_for_interrupt(desc, action)) {
irqreturn_t action_ret;
- irq_thread_check_affinity(desc, action);
-
action_ret = handler_fn(desc, action);
if (action_ret == IRQ_WAKE_THREAD)
irq_wake_secondary(desc, action);
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 75d0ae490e29..8f222d1cccec 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -8,8 +8,6 @@
#include <linux/cpu.h>
#include <linux/irq.h>
-#define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS))
-
struct cpumap {
unsigned int available;
unsigned int allocated;
@@ -17,8 +15,8 @@ struct cpumap {
unsigned int managed_allocated;
bool initialized;
bool online;
- unsigned long alloc_map[IRQ_MATRIX_SIZE];
- unsigned long managed_map[IRQ_MATRIX_SIZE];
+ unsigned long *managed_map;
+ unsigned long alloc_map[];
};
struct irq_matrix {
@@ -32,8 +30,8 @@ struct irq_matrix {
unsigned int total_allocated;
unsigned int online_maps;
struct cpumap __percpu *maps;
- unsigned long scratch_map[IRQ_MATRIX_SIZE];
- unsigned long system_map[IRQ_MATRIX_SIZE];
+ unsigned long *system_map;
+ unsigned long scratch_map[];
};
#define CREATE_TRACE_POINTS
@@ -50,24 +48,32 @@ __init struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits,
unsigned int alloc_start,
unsigned int alloc_end)
{
+ unsigned int cpu, matrix_size = BITS_TO_LONGS(matrix_bits);
struct irq_matrix *m;
- if (matrix_bits > IRQ_MATRIX_BITS)
- return NULL;
-
- m = kzalloc(sizeof(*m), GFP_KERNEL);
+ m = kzalloc(struct_size(m, scratch_map, matrix_size * 2), GFP_KERNEL);
if (!m)
return NULL;
+ m->system_map = &m->scratch_map[matrix_size];
+
m->matrix_bits = matrix_bits;
m->alloc_start = alloc_start;
m->alloc_end = alloc_end;
m->alloc_size = alloc_end - alloc_start;
- m->maps = alloc_percpu(*m->maps);
+ m->maps = __alloc_percpu(struct_size(m->maps, alloc_map, matrix_size * 2),
+ __alignof__(*m->maps));
if (!m->maps) {
kfree(m);
return NULL;
}
+
+ for_each_possible_cpu(cpu) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+
+ cm->managed_map = &cm->alloc_map[matrix_size];
+ }
+
return m;
}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 79b4a58ba9c3..f90952ebc494 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -726,11 +726,26 @@ static void msi_domain_free(struct irq_domain *domain, unsigned int virq,
irq_domain_free_irqs_top(domain, virq, nr_irqs);
}
+static int msi_domain_translate(struct irq_domain *domain, struct irq_fwspec *fwspec,
+ irq_hw_number_t *hwirq, unsigned int *type)
+{
+ struct msi_domain_info *info = domain->host_data;
+
+ /*
+ * This will catch allocations through the regular irqdomain path except
+ * for MSI domains which really support this, e.g. MBIGEN.
+ */
+ if (!info->ops->msi_translate)
+ return -ENOTSUPP;
+ return info->ops->msi_translate(domain, fwspec, hwirq, type);
+}
+
static const struct irq_domain_ops msi_domain_ops = {
.alloc = msi_domain_alloc,
.free = msi_domain_free,
.activate = msi_domain_activate,
.deactivate = msi_domain_deactivate,
+ .translate = msi_domain_translate,
};
static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info,
@@ -830,8 +845,11 @@ static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode,
domain = irq_domain_create_hierarchy(parent, flags | IRQ_DOMAIN_FLAG_MSI, 0,
fwnode, &msi_domain_ops, info);
- if (domain)
+ if (domain) {
irq_domain_update_bus_token(domain, info->bus_token);
+ if (info->flags & MSI_FLAG_PARENT_PM_DEV)
+ domain->pm_dev = parent->pm_dev;
+ }
return domain;
}
@@ -945,9 +963,9 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
void *chip_data)
{
struct irq_domain *domain, *parent = dev->msi.domain;
- const struct msi_parent_ops *pops;
+ struct fwnode_handle *fwnode, *fwnalloced = NULL;
struct msi_domain_template *bundle;
- struct fwnode_handle *fwnode;
+ const struct msi_parent_ops *pops;
if (!irq_domain_is_msi_parent(parent))
return false;
@@ -970,7 +988,19 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
pops->prefix ? : "", bundle->chip.name, dev_name(dev));
bundle->chip.name = bundle->name;
- fwnode = irq_domain_alloc_named_fwnode(bundle->name);
+ /*
+ * Using the device firmware node is required for wire to MSI
+ * device domains so that the existing firmware results in a domain
+ * match.
+ * All other device domains like PCI/MSI use the named firmware
+ * node as they are not guaranteed to have a fwnode. They are never
+ * looked up and always handled in the context of the device.
+ */
+ if (bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE)
+ fwnode = dev->fwnode;
+ else
+ fwnode = fwnalloced = irq_domain_alloc_named_fwnode(bundle->name);
+
if (!fwnode)
goto free_bundle;
@@ -997,7 +1027,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
fail:
msi_unlock_descs(dev);
free_fwnode:
- irq_domain_free_fwnode(fwnode);
+ irq_domain_free_fwnode(fwnalloced);
free_bundle:
kfree(bundle);
return false;
@@ -1431,34 +1461,10 @@ int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int
return msi_domain_alloc_locked(dev, &ctrl);
}
-/**
- * msi_domain_alloc_irq_at - Allocate an interrupt from a MSI interrupt domain at
- * a given index - or at the next free index
- *
- * @dev: Pointer to device struct of the device for which the interrupts
- * are allocated
- * @domid: Id of the interrupt domain to operate on
- * @index: Index for allocation. If @index == %MSI_ANY_INDEX the allocation
- * uses the next free index.
- * @affdesc: Optional pointer to an interrupt affinity descriptor structure
- * @icookie: Optional pointer to a domain specific per instance cookie. If
- * non-NULL the content of the cookie is stored in msi_desc::data.
- * Must be NULL for MSI-X allocations
- *
- * This requires a MSI interrupt domain which lets the core code manage the
- * MSI descriptors.
- *
- * Return: struct msi_map
- *
- * On success msi_map::index contains the allocated index number and
- * msi_map::virq the corresponding Linux interrupt number
- *
- * On failure msi_map::index contains the error code and msi_map::virq
- * is %0.
- */
-struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, unsigned int index,
- const struct irq_affinity_desc *affdesc,
- union msi_instance_cookie *icookie)
+static struct msi_map __msi_domain_alloc_irq_at(struct device *dev, unsigned int domid,
+ unsigned int index,
+ const struct irq_affinity_desc *affdesc,
+ union msi_instance_cookie *icookie)
{
struct msi_ctrl ctrl = { .domid = domid, .nirqs = 1, };
struct irq_domain *domain;
@@ -1466,17 +1472,16 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u
struct msi_desc *desc;
int ret;
- msi_lock_descs(dev);
domain = msi_get_device_domain(dev, domid);
if (!domain) {
map.index = -ENODEV;
- goto unlock;
+ return map;
}
desc = msi_alloc_desc(dev, 1, affdesc);
if (!desc) {
map.index = -ENOMEM;
- goto unlock;
+ return map;
}
if (icookie)
@@ -1485,7 +1490,7 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u
ret = msi_insert_desc(dev, desc, domid, index);
if (ret) {
map.index = ret;
- goto unlock;
+ return map;
}
ctrl.first = ctrl.last = desc->msi_index;
@@ -1498,11 +1503,90 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u
map.index = desc->msi_index;
map.virq = desc->irq;
}
-unlock:
+ return map;
+}
+
+/**
+ * msi_domain_alloc_irq_at - Allocate an interrupt from a MSI interrupt domain at
+ * a given index - or at the next free index
+ *
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @index: Index for allocation. If @index == %MSI_ANY_INDEX the allocation
+ * uses the next free index.
+ * @affdesc: Optional pointer to an interrupt affinity descriptor structure
+ * @icookie: Optional pointer to a domain specific per instance cookie. If
+ * non-NULL the content of the cookie is stored in msi_desc::data.
+ * Must be NULL for MSI-X allocations
+ *
+ * This requires a MSI interrupt domain which lets the core code manage the
+ * MSI descriptors.
+ *
+ * Return: struct msi_map
+ *
+ * On success msi_map::index contains the allocated index number and
+ * msi_map::virq the corresponding Linux interrupt number
+ *
+ * On failure msi_map::index contains the error code and msi_map::virq
+ * is %0.
+ */
+struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, unsigned int index,
+ const struct irq_affinity_desc *affdesc,
+ union msi_instance_cookie *icookie)
+{
+ struct msi_map map;
+
+ msi_lock_descs(dev);
+ map = __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie);
msi_unlock_descs(dev);
return map;
}
+/**
+ * msi_device_domain_alloc_wired - Allocate a "wired" interrupt on @domain
+ * @domain: The domain to allocate on
+ * @hwirq: The hardware interrupt number to allocate for
+ * @type: The interrupt type
+ *
+ * This weirdness supports wire to MSI controllers like MBIGEN.
+ *
+ * @hwirq is the hardware interrupt number which is handed in from
+ * irq_create_fwspec_mapping(). As the wire to MSI domain is sparse, but
+ * sized in firmware, the hardware interrupt number cannot be used as MSI
+ * index. For the underlying irq chip the MSI index is irrelevant and
+ * all it needs is the hardware interrupt number.
+ *
+ * To handle this the MSI index is allocated with MSI_ANY_INDEX and the
+ * hardware interrupt number is stored along with the type information in
+ * msi_desc::cookie so the underlying interrupt chip and domain code can
+ * retrieve it.
+ *
+ * Return: The Linux interrupt number (> 0) or an error code
+ */
+int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq,
+ unsigned int type)
+{
+ unsigned int domid = MSI_DEFAULT_DOMAIN;
+ union msi_instance_cookie icookie = { };
+ struct device *dev = domain->dev;
+ struct msi_map map = { };
+
+ if (WARN_ON_ONCE(!dev || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI))
+ return -EINVAL;
+
+ icookie.value = ((u64)type << 32) | hwirq;
+
+ msi_lock_descs(dev);
+ if (WARN_ON_ONCE(msi_get_device_domain(dev, domid) != domain))
+ map.index = -EINVAL;
+ else
+ map = __msi_domain_alloc_irq_at(dev, domid, MSI_ANY_INDEX, NULL, &icookie);
+ msi_unlock_descs(dev);
+
+ return map.index >= 0 ? map.virq : map.index;
+}
+
static void __msi_domain_free_irqs(struct device *dev, struct irq_domain *domain,
struct msi_ctrl *ctrl)
{
@@ -1629,6 +1713,30 @@ void msi_domain_free_irqs_all(struct device *dev, unsigned int domid)
}
/**
+ * msi_device_domain_free_wired - Free a wired interrupt in @domain
+ * @domain: The domain to free the interrupt on
+ * @virq: The Linux interrupt number to free
+ *
+ * This is the counterpart of msi_device_domain_alloc_wired() for the
+ * weird wired to MSI converting domains.
+ */
+void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq)
+{
+ struct msi_desc *desc = irq_get_msi_desc(virq);
+ struct device *dev = domain->dev;
+
+ if (WARN_ON_ONCE(!dev || !desc || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI))
+ return;
+
+ msi_lock_descs(dev);
+ if (!WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) {
+ msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index,
+ desc->msi_index);
+ }
+ msi_unlock_descs(dev);
+}
+
+/**
* msi_get_domain_info - Get the MSI interrupt domain info for @domain
* @domain: The interrupt domain to retrieve data from
*
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d5a0ee40bf66..9d9095e81792 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1993,7 +1993,7 @@ NOKPROBE_SYMBOL(__kretprobe_find_ret_addr);
unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
struct llist_node **cur)
{
- struct kretprobe_instance *ri = NULL;
+ struct kretprobe_instance *ri;
kprobe_opcode_t *ret;
if (WARN_ON_ONCE(!cur))
@@ -2802,7 +2802,7 @@ static int show_kprobe_addr(struct seq_file *pi, void *v)
{
struct hlist_head *head;
struct kprobe *p, *kp;
- const char *sym = NULL;
+ const char *sym;
unsigned int i = *(loff_t *) v;
unsigned long offset = 0;
char *modname, namebuf[KSYM_NAME_LEN];
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 15781acaac1c..6ec3deec68c2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -573,7 +573,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
if (proc_ns_file(f.file))
err = validate_ns(&nsset, ns);
else
- err = validate_nsset(&nsset, f.file->private_data);
+ err = validate_nsset(&nsset, pidfd_pid(f.file));
if (!err) {
commit_nsset(&nsset);
perf_event_namespaces(current);
diff --git a/kernel/pid.c b/kernel/pid.c
index b52b10865454..99a0c5eb24b8 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -42,6 +42,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
+#include <linux/pidfs.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>
@@ -65,6 +66,13 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
+#ifdef CONFIG_FS_PID
+/*
+ * Pseudo filesystems start inode numbering after one. We use Reserved
+ * PIDs as a natural offset.
+ */
+static u64 pidfs_ino = RESERVED_PIDS;
+#endif
/*
* PID-map pages start out as NULL, they get allocated upon
@@ -272,6 +280,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
+#ifdef CONFIG_FS_PID
+ pid->stashed = NULL;
+ pid->ino = ++pidfs_ino;
+#endif
for ( ; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);
@@ -349,6 +361,11 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
hlist_del_rcu(&task->pid_links[type]);
*pid_ptr = new;
+ if (type == PIDTYPE_PID) {
+ WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
+ wake_up_all(&pid->wait_pidfd);
+ }
+
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
if (pid_has_task(pid, tmp))
return;
@@ -391,8 +408,7 @@ void exchange_tids(struct task_struct *left, struct task_struct *right)
void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type type)
{
- if (type == PIDTYPE_PID)
- new->thread_pid = old->thread_pid;
+ WARN_ON_ONCE(type == PIDTYPE_PID);
hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}
@@ -552,11 +568,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
* Return the task associated with @pidfd. The function takes a reference on
* the returned task. The caller is responsible for releasing that reference.
*
- * Currently, the process identified by @pidfd is always a thread-group leader.
- * This restriction currently exists for all aspects of pidfds including pidfd
- * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
- * (only supports thread group leaders).
- *
* Return: On success, the task_struct associated with the pidfd.
* On error, a negative errno number will be returned.
*/
@@ -595,7 +606,7 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
*/
-int pidfd_create(struct pid *pid, unsigned int flags)
+static int pidfd_create(struct pid *pid, unsigned int flags)
{
int pidfd;
struct file *pidfd_file;
@@ -615,11 +626,8 @@ int pidfd_create(struct pid *pid, unsigned int flags)
* @flags: flags to pass
*
* This creates a new pid file descriptor with the O_CLOEXEC flag set for
- * the process identified by @pid. Currently, the process identified by
- * @pid must be a thread-group leader. This restriction currently exists
- * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
- * be used with CLONE_THREAD) and pidfd polling (only supports thread group
- * leaders).
+ * the task identified by @pid. Without PIDFD_THREAD flag the target task
+ * must be a thread-group leader.
*
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
@@ -629,7 +637,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
int fd;
struct pid *p;
- if (flags & ~PIDFD_NONBLOCK)
+ if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
return -EINVAL;
if (pid <= 0)
@@ -682,7 +690,26 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd)
up_read(&task->signal->exec_update_lock);
- return file ?: ERR_PTR(-EBADF);
+ if (!file) {
+ /*
+ * It is possible that the target thread is exiting; it can be
+ * either:
+ * 1. before exit_signals(), which gives a real fd
+ * 2. before exit_files() takes the task_lock() gives a real fd
+ * 3. after exit_files() releases task_lock(), ->files is NULL;
+ * this has PF_EXITING, since it was set in exit_signals(),
+ * __pidfd_fget() returns EBADF.
+ * In case 3 we get EBADF, but that really means ESRCH, since
+ * the task is currently exiting and has freed its files
+ * struct, so we fix it up.
+ */
+ if (task->flags & PF_EXITING)
+ file = ERR_PTR(-ESRCH);
+ else
+ file = ERR_PTR(-EBADF);
+ }
+
+ return file;
}
static int pidfd_getfd(struct pid *pid, int fd)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 6053ddddaf65..692f12fe60c1 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -222,7 +222,7 @@ int swsusp_swap_in_use(void)
*/
static unsigned short root_swap = 0xffff;
-static struct bdev_handle *hib_resume_bdev_handle;
+static struct file *hib_resume_bdev_file;
struct hib_bio_batch {
atomic_t count;
@@ -276,7 +276,7 @@ static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr,
struct bio *bio;
int error = 0;
- bio = bio_alloc(hib_resume_bdev_handle->bdev, 1, opf,
+ bio = bio_alloc(file_bdev(hib_resume_bdev_file), 1, opf,
GFP_NOIO | __GFP_HIGH);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
@@ -357,14 +357,14 @@ static int swsusp_swap_check(void)
return res;
root_swap = res;
- hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device,
+ hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device,
BLK_OPEN_WRITE, NULL, NULL);
- if (IS_ERR(hib_resume_bdev_handle))
- return PTR_ERR(hib_resume_bdev_handle);
+ if (IS_ERR(hib_resume_bdev_file))
+ return PTR_ERR(hib_resume_bdev_file);
- res = set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE);
+ res = set_blocksize(file_bdev(hib_resume_bdev_file), PAGE_SIZE);
if (res < 0)
- bdev_release(hib_resume_bdev_handle);
+ fput(hib_resume_bdev_file);
return res;
}
@@ -1523,10 +1523,10 @@ int swsusp_check(bool exclusive)
void *holder = exclusive ? &swsusp_holder : NULL;
int error;
- hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device,
+ hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device,
BLK_OPEN_READ, holder, NULL);
- if (!IS_ERR(hib_resume_bdev_handle)) {
- set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE);
+ if (!IS_ERR(hib_resume_bdev_file)) {
+ set_blocksize(file_bdev(hib_resume_bdev_file), PAGE_SIZE);
clear_page(swsusp_header);
error = hib_submit_io(REQ_OP_READ, swsusp_resume_block,
swsusp_header, NULL);
@@ -1551,11 +1551,11 @@ int swsusp_check(bool exclusive)
put:
if (error)
- bdev_release(hib_resume_bdev_handle);
+ fput(hib_resume_bdev_file);
else
pr_debug("Image signature found, resuming\n");
} else {
- error = PTR_ERR(hib_resume_bdev_handle);
+ error = PTR_ERR(hib_resume_bdev_file);
}
if (error)
@@ -1570,12 +1570,12 @@ put:
void swsusp_close(void)
{
- if (IS_ERR(hib_resume_bdev_handle)) {
+ if (IS_ERR(hib_resume_bdev_file)) {
pr_debug("Image device not initialised\n");
return;
}
- bdev_release(hib_resume_bdev_handle);
+ fput(hib_resume_bdev_file);
}
/**
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index bdd7eadb33d8..e7d2dd267593 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -314,6 +314,19 @@ config RCU_LAZY
To save power, batch RCU callbacks and flush after delay, memory
pressure, or callback list growing too big.
+ Requires rcu_nocbs=all to be set.
+
+ Use rcutree.enable_rcu_lazy=0 to turn it off at boot time.
+
+config RCU_LAZY_DEFAULT_OFF
+ bool "Turn RCU lazy invocation off by default"
+ depends on RCU_LAZY
+ default n
+ help
+ Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default
+ off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch
+ it back on.
+
config RCU_DOUBLE_CHECK_CB_TIME
bool "RCU callback-batch backup time check"
depends on RCU_EXPERT
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index f94f65877f2b..86fce206560e 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -528,6 +528,12 @@ struct task_struct *get_rcu_tasks_gp_kthread(void);
struct task_struct *get_rcu_tasks_rude_gp_kthread(void);
#endif // # ifdef CONFIG_TASKS_RUDE_RCU
+#ifdef CONFIG_TASKS_RCU_GENERIC
+void tasks_cblist_init_generic(void);
+#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
+static inline void tasks_cblist_init_generic(void) { }
+#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
+
#define RCU_SCHEDULER_INACTIVE 0
#define RCU_SCHEDULER_INIT 1
#define RCU_SCHEDULER_RUNNING 2
@@ -543,11 +549,11 @@ enum rcutorture_type {
};
#if defined(CONFIG_RCU_LAZY)
-unsigned long rcu_lazy_get_jiffies_till_flush(void);
-void rcu_lazy_set_jiffies_till_flush(unsigned long j);
+unsigned long rcu_get_jiffies_lazy_flush(void);
+void rcu_set_jiffies_lazy_flush(unsigned long j);
#else
-static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
-static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
+static inline unsigned long rcu_get_jiffies_lazy_flush(void) { return 0; }
+static inline void rcu_set_jiffies_lazy_flush(unsigned long j) { }
#endif
#if defined(CONFIG_TREE_RCU)
@@ -623,12 +629,7 @@ int rcu_get_gp_kthreads_prio(void);
void rcu_fwd_progress_check(unsigned long j);
void rcu_force_quiescent_state(void);
extern struct workqueue_struct *rcu_gp_wq;
-#ifdef CONFIG_RCU_EXP_KTHREAD
extern struct kthread_worker *rcu_exp_gp_kworker;
-extern struct kthread_worker *rcu_exp_par_gp_kworker;
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-extern struct workqueue_struct *rcu_par_gp_wq;
-#endif /* CONFIG_RCU_EXP_KTHREAD */
void rcu_gp_slow_register(atomic_t *rgssp);
void rcu_gp_slow_unregister(atomic_t *rgssp);
#endif /* #else #ifdef CONFIG_TINY_RCU */
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index ffdb30495e3c..8db4fedaaa1e 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -764,9 +764,9 @@ kfree_scale_init(void)
if (kfree_by_call_rcu) {
/* do a test to check the timeout. */
- orig_jif = rcu_lazy_get_jiffies_till_flush();
+ orig_jif = rcu_get_jiffies_lazy_flush();
- rcu_lazy_set_jiffies_till_flush(2 * HZ);
+ rcu_set_jiffies_lazy_flush(2 * HZ);
rcu_barrier();
jif_start = jiffies;
@@ -775,7 +775,7 @@ kfree_scale_init(void)
smp_cond_load_relaxed(&rcu_lazy_test1_cb_called, VAL == 1);
- rcu_lazy_set_jiffies_till_flush(orig_jif);
+ rcu_set_jiffies_lazy_flush(orig_jif);
if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) {
pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n");
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 7567ca8e743c..45d6b4c3d199 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1368,9 +1368,13 @@ rcu_torture_writer(void *arg)
struct rcu_torture *rp;
struct rcu_torture *old_rp;
static DEFINE_TORTURE_RANDOM(rand);
+ unsigned long stallsdone = jiffies;
bool stutter_waited;
unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+ // If a new stall test is added, this must be adjusted.
+ if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu)
+ stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * HZ;
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
if (!can_expedite)
pr_alert("%s" TORTURE_FLAG
@@ -1576,11 +1580,11 @@ rcu_torture_writer(void *arg)
!atomic_read(&rcu_fwd_cb_nodelay) &&
!cur_ops->slow_gps &&
!torture_must_stop() &&
- boot_ended)
+ boot_ended &&
+ time_after(jiffies, stallsdone))
for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
if (list_empty(&rcu_tortures[i].rtort_free) &&
- rcu_access_pointer(rcu_torture_current) !=
- &rcu_tortures[i]) {
+ rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) {
tracing_off();
show_rcu_gp_kthreads();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
@@ -2441,7 +2445,8 @@ static struct notifier_block rcu_torture_stall_block = {
/*
* CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
- * induces a CPU stall for the time specified by stall_cpu.
+ * induces a CPU stall for the time specified by stall_cpu. If a new
+ * stall test is added, stallsdone in rcu_torture_writer() must be adjusted.
*/
static int rcu_torture_stall(void *args)
{
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 0351a4e83529..e4d673fc30f4 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1234,11 +1234,20 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
if (rhp)
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
/*
- * The snapshot for acceleration must be taken _before_ the read of the
- * current gp sequence used for advancing, otherwise advancing may fail
- * and acceleration may then fail too.
+ * It's crucial to capture the snapshot 's' for acceleration before
+ * reading the current gp_seq that is used for advancing. This is
+ * essential because if the acceleration snapshot is taken after a
+ * failed advancement attempt, there's a risk that a grace period may
+ * conclude and a new one may start in the interim. If the snapshot is
+ * captured after this sequence of events, the acceleration snapshot 's'
+ * could be excessively advanced, leading to acceleration failure.
+ * In such a scenario, an 'acceleration leak' can occur, where new
+ * callbacks become indefinitely stuck in the RCU_NEXT_TAIL segment.
+ * Also note that encountering advancing failures is a normal
+ * occurrence when the grace period for RCU_WAIT_TAIL is in progress.
*
- * This could happen if:
+ * To see this, consider the following events which occur if
+ * rcu_seq_snap() were to be called after advance:
*
* 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the
* RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8).
@@ -1264,6 +1273,13 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
if (rhp) {
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ /*
+ * Acceleration can never fail because the base current gp_seq
+ * used for acceleration is <= the value of gp_seq used for
+ * advancing. This means that RCU_NEXT_TAIL segment will
+ * always be able to be emptied by the acceleration into the
+ * RCU_NEXT_READY_TAIL or RCU_WAIT_TAIL segments.
+ */
WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s));
}
if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index e550f97779b8..86df878a2fee 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -24,22 +24,6 @@ void rcu_sync_init(struct rcu_sync *rsp)
init_waitqueue_head(&rsp->gp_wait);
}
-/**
- * rcu_sync_enter_start - Force readers onto slow path for multiple updates
- * @rsp: Pointer to rcu_sync structure to use for synchronization
- *
- * Must be called after rcu_sync_init() and before first use.
- *
- * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
- * pairs turn into NO-OPs.
- */
-void rcu_sync_enter_start(struct rcu_sync *rsp)
-{
- rsp->gp_count++;
- rsp->gp_state = GP_PASSED;
-}
-
-
static void rcu_sync_func(struct rcu_head *rhp);
static void rcu_sync_call(struct rcu_sync *rsp)
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 732ad5b39946..147b5945d67a 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -32,6 +32,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* @rtp_irq_work: IRQ work queue for deferred wakeups.
* @barrier_q_head: RCU callback for barrier operation.
* @rtp_blkd_tasks: List of tasks blocked as readers.
+ * @rtp_exit_list: List of tasks in the latter portion of do_exit().
* @cpu: CPU number corresponding to this entry.
* @rtpp: Pointer to the rcu_tasks structure.
*/
@@ -46,6 +47,7 @@ struct rcu_tasks_percpu {
struct irq_work rtp_irq_work;
struct rcu_head barrier_q_head;
struct list_head rtp_blkd_tasks;
+ struct list_head rtp_exit_list;
int cpu;
struct rcu_tasks *rtpp;
};
@@ -144,8 +146,6 @@ static struct rcu_tasks rt_name = \
}
#ifdef CONFIG_TASKS_RCU
-/* Track exiting tasks in order to allow them to be waited for. */
-DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
@@ -240,7 +240,6 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
static void cblist_init_generic(struct rcu_tasks *rtp)
{
int cpu;
- unsigned long flags;
int lim;
int shift;
@@ -266,15 +265,15 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
WARN_ON_ONCE(!rtpcp);
if (cpu)
raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock));
- local_irq_save(flags); // serialize initialization
if (rcu_segcblist_empty(&rtpcp->cblist))
rcu_segcblist_init(&rtpcp->cblist);
- local_irq_restore(flags);
INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
rtpcp->cpu = cpu;
rtpcp->rtpp = rtp;
if (!rtpcp->rtp_blkd_tasks.next)
INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
+ if (!rtpcp->rtp_exit_list.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
}
pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name,
@@ -851,10 +850,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// number of voluntary context switches, and add that task to the
// holdout list.
// rcu_tasks_postscan():
-// Invoke synchronize_srcu() to ensure that all tasks that were
-// in the process of exiting (and which thus might not know to
-// synchronize with this RCU Tasks grace period) have completed
-// exiting.
+// Gather per-CPU lists of tasks in do_exit() to ensure that all
+// tasks that were in the process of exiting (and which thus might
+// not know to synchronize with this RCU Tasks grace period) have
+// completed exiting. The synchronize_rcu() in rcu_tasks_postgp()
+// will take care of any tasks stuck in the non-preemptible region
+// of do_exit() following its call to exit_tasks_rcu_stop().
// check_all_holdout_tasks(), repeatedly until holdout list is empty:
// Scans the holdout list, attempting to identify a quiescent state
// for each task on the list. If there is a quiescent state, the
@@ -867,8 +868,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// with interrupts disabled.
//
// For each exiting task, the exit_tasks_rcu_start() and
-// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU
-// read-side critical sections waited for by rcu_tasks_postscan().
+// exit_tasks_rcu_finish() functions add and remove, respectively, the
+// current task to a per-CPU list of tasks that rcu_tasks_postscan() must
+// wait on. This is necessary because rcu_tasks_postscan() must wait on
+// tasks that have already been removed from the global list of tasks.
//
// Pre-grace-period update-side code is ordered before the grace
// via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code
@@ -932,9 +935,13 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
}
}
+void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
+DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
+
/* Processing between scanning taskslist and draining the holdout list. */
static void rcu_tasks_postscan(struct list_head *hop)
{
+ int cpu;
int rtsi = READ_ONCE(rcu_task_stall_info);
if (!IS_ENABLED(CONFIG_TINY_RCU)) {
@@ -948,9 +955,9 @@ static void rcu_tasks_postscan(struct list_head *hop)
* this, divide the fragile exit path part in two intersecting
* read side critical sections:
*
- * 1) An _SRCU_ read side starting before calling exit_notify(),
- * which may remove the task from the tasklist, and ending after
- * the final preempt_disable() call in do_exit().
+ * 1) A task_struct list addition before calling exit_notify(),
+ * which may remove the task from the tasklist, with the
+ * removal after the final preempt_disable() call in do_exit().
*
* 2) An _RCU_ read side starting with the final preempt_disable()
* call in do_exit() and ending with the final call to schedule()
@@ -959,7 +966,37 @@ static void rcu_tasks_postscan(struct list_head *hop)
* This handles the part 1). And postgp will handle part 2) with a
* call to synchronize_rcu().
*/
- synchronize_srcu(&tasks_rcu_exit_srcu);
+
+ for_each_possible_cpu(cpu) {
+ unsigned long j = jiffies + 1;
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, cpu);
+ struct task_struct *t;
+ struct task_struct *t1;
+ struct list_head tmp;
+
+ raw_spin_lock_irq_rcu_node(rtpcp);
+ list_for_each_entry_safe(t, t1, &rtpcp->rtp_exit_list, rcu_tasks_exit_list) {
+ if (list_empty(&t->rcu_tasks_holdout_list))
+ rcu_tasks_pertask(t, hop);
+
+ // RT kernels need frequent pauses, otherwise
+ // pause at least once per pair of jiffies.
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && time_before(jiffies, j))
+ continue;
+
+ // Keep our place in the list while pausing.
+ // Nothing else traverses this list, so adding a
+ // bare list_head is OK.
+ list_add(&tmp, &t->rcu_tasks_exit_list);
+ raw_spin_unlock_irq_rcu_node(rtpcp);
+ cond_resched(); // For CONFIG_PREEMPT=n kernels
+ raw_spin_lock_irq_rcu_node(rtpcp);
+ t1 = list_entry(tmp.next, struct task_struct, rcu_tasks_exit_list);
+ list_del(&tmp);
+ j = jiffies + 1;
+ }
+ raw_spin_unlock_irq_rcu_node(rtpcp);
+ }
if (!IS_ENABLED(CONFIG_TINY_RCU))
del_timer_sync(&tasks_rcu_exit_srcu_stall_timer);
@@ -1027,7 +1064,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
*
* In addition, this synchronize_rcu() waits for exiting tasks
* to complete their final preempt_disable() region of execution,
- * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu),
* enforcing the whole region before tasklist removal until
* the final schedule() with TASK_DEAD state to be an RCU TASKS
* read side critical section.
@@ -1035,9 +1071,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
synchronize_rcu();
}
-void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
-DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
-
static void tasks_rcu_exit_srcu_stall(struct timer_list *unused)
{
#ifndef CONFIG_TINY_RCU
@@ -1118,7 +1151,6 @@ module_param(rcu_tasks_lazy_ms, int, 0444);
static int __init rcu_spawn_tasks_kthread(void)
{
- cblist_init_generic(&rcu_tasks);
rcu_tasks.gp_sleep = HZ / 10;
rcu_tasks.init_fract = HZ / 10;
if (rcu_tasks_lazy_ms >= 0)
@@ -1147,25 +1179,48 @@ struct task_struct *get_rcu_tasks_gp_kthread(void)
EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread);
/*
- * Contribute to protect against tasklist scan blind spot while the
- * task is exiting and may be removed from the tasklist. See
- * corresponding synchronize_srcu() for further details.
+ * Protect against tasklist scan blind spot while the task is exiting and
+ * may be removed from the tasklist. Do this by adding the task to yet
+ * another list.
+ *
+ * Note that the task will remove itself from this list, so there is no
+ * need for get_task_struct(), except in the case where rcu_tasks_pertask()
+ * adds it to the holdout list, in which case rcu_tasks_pertask() supplies
+ * the needed get_task_struct().
*/
-void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
+void exit_tasks_rcu_start(void)
{
- current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ struct task_struct *t = current;
+
+ WARN_ON_ONCE(!list_empty(&t->rcu_tasks_exit_list));
+ preempt_disable();
+ rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu);
+ t->rcu_tasks_exit_cpu = smp_processor_id();
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ if (!rtpcp->rtp_exit_list.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
+ list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ preempt_enable();
}
/*
- * Contribute to protect against tasklist scan blind spot while the
- * task is exiting and may be removed from the tasklist. See
- * corresponding synchronize_srcu() for further details.
+ * Remove the task from the "yet another list" because do_exit() is now
+ * non-preemptible, allowing synchronize_rcu() to wait beyond this point.
*/
-void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu)
+void exit_tasks_rcu_stop(void)
{
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
struct task_struct *t = current;
- __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx);
+ WARN_ON_ONCE(list_empty(&t->rcu_tasks_exit_list));
+ rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, t->rcu_tasks_exit_cpu);
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ list_del_init(&t->rcu_tasks_exit_list);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
}
/*
@@ -1282,7 +1337,6 @@ module_param(rcu_tasks_rude_lazy_ms, int, 0444);
static int __init rcu_spawn_tasks_rude_kthread(void)
{
- cblist_init_generic(&rcu_tasks_rude);
rcu_tasks_rude.gp_sleep = HZ / 10;
if (rcu_tasks_rude_lazy_ms >= 0)
rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms);
@@ -1914,7 +1968,6 @@ module_param(rcu_tasks_trace_lazy_ms, int, 0444);
static int __init rcu_spawn_tasks_trace_kthread(void)
{
- cblist_init_generic(&rcu_tasks_trace);
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) {
rcu_tasks_trace.gp_sleep = HZ / 10;
rcu_tasks_trace.init_fract = HZ / 10;
@@ -2086,6 +2139,24 @@ late_initcall(rcu_tasks_verify_schedule_work);
static void rcu_tasks_initiate_self_tests(void) { }
#endif /* #else #ifdef CONFIG_PROVE_RCU */
+void __init tasks_cblist_init_generic(void)
+{
+ lockdep_assert_irqs_disabled();
+ WARN_ON(num_online_cpus() > 1);
+
+#ifdef CONFIG_TASKS_RCU
+ cblist_init_generic(&rcu_tasks);
+#endif
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+ cblist_init_generic(&rcu_tasks_rude);
+#endif
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ cblist_init_generic(&rcu_tasks_trace);
+#endif
+}
+
void __init rcu_init_tasks_generic(void)
{
#ifdef CONFIG_TASKS_RCU
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index fec804b79080..705c0d16850a 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -261,4 +261,5 @@ void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
rcu_early_boot_tests();
+ tasks_cblist_init_generic();
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b2bccfd37c38..d9642dd06c25 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -145,7 +145,7 @@ static int rcu_scheduler_fully_active __read_mostly;
static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
unsigned long gps, unsigned long flags);
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
+static struct task_struct *rcu_boost_task(struct rcu_node *rnp);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
@@ -2145,6 +2145,12 @@ static void rcu_do_batch(struct rcu_data *rdp)
* Extract the list of ready callbacks, disabling IRQs to prevent
* races with call_rcu() from interrupt handlers. Leave the
* callback counts, as rcu_barrier() needs to be conservative.
+ *
+ * Callbacks execution is fully ordered against preceding grace period
+ * completion (materialized by rnp->gp_seq update) thanks to the
+ * smp_mb__after_unlock_lock() upon node locking required for callbacks
+ * advancing. In NOCB mode this ordering is then further relayed through
+ * the nocb locking that protects both callbacks advancing and extraction.
*/
rcu_nocb_lock_irqsave(rdp, flags);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
@@ -2591,12 +2597,26 @@ static int __init rcu_spawn_core_kthreads(void)
return 0;
}
+static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func)
+{
+ rcu_segcblist_enqueue(&rdp->cblist, head);
+ if (__is_kvfree_rcu_offset((unsigned long)func))
+ trace_rcu_kvfree_callback(rcu_state.name, head,
+ (unsigned long)func,
+ rcu_segcblist_n_cbs(&rdp->cblist));
+ else
+ trace_rcu_callback(rcu_state.name, head,
+ rcu_segcblist_n_cbs(&rdp->cblist));
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
+}
+
/*
* Handle any core-RCU processing required by a call_rcu() invocation.
*/
-static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
- unsigned long flags)
+static void call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
+ rcu_callback_t func, unsigned long flags)
{
+ rcutree_enqueue(rdp, head, func);
/*
* If called from an extended quiescent state, invoke the RCU
* core in order to force a re-evaluation of RCU's idleness.
@@ -2692,7 +2712,6 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
unsigned long flags;
bool lazy;
struct rcu_data *rdp;
- bool was_alldone;
/* Misaligned rcu_head! */
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
@@ -2729,30 +2748,18 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
}
check_cb_ovld(rdp);
- if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
- return; // Enqueued onto ->nocb_bypass, so just leave.
- // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
- rcu_segcblist_enqueue(&rdp->cblist, head);
- if (__is_kvfree_rcu_offset((unsigned long)func))
- trace_rcu_kvfree_callback(rcu_state.name, head,
- (unsigned long)func,
- rcu_segcblist_n_cbs(&rdp->cblist));
- else
- trace_rcu_callback(rcu_state.name, head,
- rcu_segcblist_n_cbs(&rdp->cblist));
- trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
-
- /* Go handle any RCU core processing required. */
- if (unlikely(rcu_rdp_is_offloaded(rdp))) {
- __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
- } else {
- __call_rcu_core(rdp, head, flags);
- local_irq_restore(flags);
- }
+ if (unlikely(rcu_rdp_is_offloaded(rdp)))
+ call_rcu_nocb(rdp, head, func, flags, lazy);
+ else
+ call_rcu_core(rdp, head, func, flags);
+ local_irq_restore(flags);
}
#ifdef CONFIG_RCU_LAZY
+static bool enable_rcu_lazy __read_mostly = !IS_ENABLED(CONFIG_RCU_LAZY_DEFAULT_OFF);
+module_param(enable_rcu_lazy, bool, 0444);
+
/**
* call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
* flush all lazy callbacks (including the new one) to the main ->cblist while
@@ -2778,6 +2785,8 @@ void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
__call_rcu_common(head, func, false);
}
EXPORT_SYMBOL_GPL(call_rcu_hurry);
+#else
+#define enable_rcu_lazy false
#endif
/**
@@ -2826,7 +2835,7 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry);
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
+ __call_rcu_common(head, func, enable_rcu_lazy);
}
EXPORT_SYMBOL_GPL(call_rcu);
@@ -4394,6 +4403,66 @@ rcu_boot_init_percpu_data(int cpu)
rcu_boot_init_nocb_percpu_data(rdp);
}
+struct kthread_worker *rcu_exp_gp_kworker;
+
+static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
+{
+ struct kthread_worker *kworker;
+ const char *name = "rcu_exp_par_gp_kthread_worker/%d";
+ struct sched_param param = { .sched_priority = kthread_prio };
+ int rnp_index = rnp - rcu_get_root();
+
+ if (rnp->exp_kworker)
+ return;
+
+ kworker = kthread_create_worker(0, name, rnp_index);
+ if (IS_ERR_OR_NULL(kworker)) {
+ pr_err("Failed to create par gp kworker on %d/%d\n",
+ rnp->grplo, rnp->grphi);
+ return;
+ }
+ WRITE_ONCE(rnp->exp_kworker, kworker);
+
+ if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
+ sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
+}
+
+static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
+{
+ struct kthread_worker *kworker = READ_ONCE(rnp->exp_kworker);
+
+ if (!kworker)
+ return NULL;
+
+ return kworker->task;
+}
+
+static void __init rcu_start_exp_gp_kworker(void)
+{
+ const char *name = "rcu_exp_gp_kthread_worker";
+ struct sched_param param = { .sched_priority = kthread_prio };
+
+ rcu_exp_gp_kworker = kthread_create_worker(0, name);
+ if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
+ pr_err("Failed to create %s!\n", name);
+ rcu_exp_gp_kworker = NULL;
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
+ sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
+}
+
+static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp)
+{
+ if (rcu_scheduler_fully_active) {
+ mutex_lock(&rnp->kthread_mutex);
+ rcu_spawn_one_boost_kthread(rnp);
+ rcu_spawn_exp_par_gp_kworker(rnp);
+ mutex_unlock(&rnp->kthread_mutex);
+ }
+}
+
/*
* Invoked early in the CPU-online process, when pretty much all services
* are available. The incoming CPU is not present.
@@ -4442,7 +4511,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- rcu_spawn_one_boost_kthread(rnp);
+ rcu_spawn_rnp_kthreads(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
@@ -4450,13 +4519,64 @@ int rcutree_prepare_cpu(unsigned int cpu)
}
/*
- * Update RCU priority boot kthread affinity for CPU-hotplug changes.
+ * Update kthreads affinity during CPU-hotplug changes.
+ *
+ * Set the per-rcu_node kthread's affinity to cover all CPUs that are
+ * served by the rcu_node in question. The CPU hotplug lock is still
+ * held, so the value of rnp->qsmaskinit will be stable.
+ *
+ * We don't include outgoingcpu in the affinity set, use -1 if there is
+ * no outgoing CPU. If there are no CPUs left in the affinity set,
+ * this function allows the kthread to execute on any CPU.
+ *
+ * Any future concurrent calls are serialized via ->kthread_mutex.
*/
-static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
+static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu)
{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ cpumask_var_t cm;
+ unsigned long mask;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct task_struct *task_boost, *task_exp;
+
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ rnp = rdp->mynode;
+
+ task_boost = rcu_boost_task(rnp);
+ task_exp = rcu_exp_par_gp_task(rnp);
+
+ /*
+ * If CPU is the boot one, those tasks are created later from early
+ * initcall since kthreadd must be created first.
+ */
+ if (!task_boost && !task_exp)
+ return;
+
+ if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
+ return;
+
+ mutex_lock(&rnp->kthread_mutex);
+ mask = rcu_rnp_online_cpus(rnp);
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
+ cpu != outgoingcpu)
+ cpumask_set_cpu(cpu, cm);
+ cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
+ if (cpumask_empty(cm)) {
+ cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
+ if (outgoingcpu >= 0)
+ cpumask_clear_cpu(outgoingcpu, cm);
+ }
+
+ if (task_exp)
+ set_cpus_allowed_ptr(task_exp, cm);
+
+ if (task_boost)
+ set_cpus_allowed_ptr(task_boost, cm);
- rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
+ mutex_unlock(&rnp->kthread_mutex);
+
+ free_cpumask_var(cm);
}
/*
@@ -4640,8 +4760,9 @@ void rcutree_migrate_callbacks(int cpu)
__call_rcu_nocb_wake(my_rdp, true, flags);
} else {
rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */
- raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags);
+ raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
}
+ local_irq_restore(flags);
if (needwake)
rcu_gp_kthread_wake();
lockdep_assert_irqs_enabled();
@@ -4730,51 +4851,6 @@ static int rcu_pm_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-#ifdef CONFIG_RCU_EXP_KTHREAD
-struct kthread_worker *rcu_exp_gp_kworker;
-struct kthread_worker *rcu_exp_par_gp_kworker;
-
-static void __init rcu_start_exp_gp_kworkers(void)
-{
- const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
- const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
- struct sched_param param = { .sched_priority = kthread_prio };
-
- rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
- if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
- pr_err("Failed to create %s!\n", gp_kworker_name);
- return;
- }
-
- rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
- if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
- pr_err("Failed to create %s!\n", par_gp_kworker_name);
- kthread_destroy_worker(rcu_exp_gp_kworker);
- return;
- }
-
- sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
- sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
- &param);
-}
-
-static inline void rcu_alloc_par_gp_wq(void)
-{
-}
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-struct workqueue_struct *rcu_par_gp_wq;
-
-static void __init rcu_start_exp_gp_kworkers(void)
-{
-}
-
-static inline void rcu_alloc_par_gp_wq(void)
-{
- rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
- WARN_ON(!rcu_par_gp_wq);
-}
-#endif /* CONFIG_RCU_EXP_KTHREAD */
-
/*
* Spawn the kthreads that handle RCU's grace periods.
*/
@@ -4809,10 +4885,10 @@ static int __init rcu_spawn_gp_kthread(void)
* due to rcu_scheduler_fully_active.
*/
rcu_spawn_cpu_nocb_kthread(smp_processor_id());
- rcu_spawn_one_boost_kthread(rdp->mynode);
+ rcu_spawn_rnp_kthreads(rdp->mynode);
rcu_spawn_core_kthreads();
/* Create kthread worker for expedited GPs */
- rcu_start_exp_gp_kworkers();
+ rcu_start_exp_gp_kworker();
return 0;
}
early_initcall(rcu_spawn_gp_kthread);
@@ -4915,7 +4991,7 @@ static void __init rcu_init_one(void)
init_waitqueue_head(&rnp->exp_wq[2]);
init_waitqueue_head(&rnp->exp_wq[3]);
spin_lock_init(&rnp->exp_lock);
- mutex_init(&rnp->boost_kthread_mutex);
+ mutex_init(&rnp->kthread_mutex);
raw_spin_lock_init(&rnp->exp_poll_lock);
rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
@@ -5152,7 +5228,6 @@ void __init rcu_init(void)
/* Create workqueue for Tree SRCU and for expedited GPs. */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
- rcu_alloc_par_gp_wq();
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
@@ -5165,6 +5240,8 @@ void __init rcu_init(void)
(void)start_poll_synchronize_rcu_expedited();
rcu_test_sync_prims();
+
+ tasks_cblist_init_generic();
}
#include "tree_stall.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e9821a8422db..df48160b3136 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -21,14 +21,10 @@
#include "rcu_segcblist.h"
-/* Communicate arguments to a workqueue handler. */
+/* Communicate arguments to a kthread worker handler. */
struct rcu_exp_work {
unsigned long rew_s;
-#ifdef CONFIG_RCU_EXP_KTHREAD
struct kthread_work rew_work;
-#else
- struct work_struct rew_work;
-#endif /* CONFIG_RCU_EXP_KTHREAD */
};
/* RCU's kthread states for tracing. */
@@ -72,6 +68,9 @@ struct rcu_node {
/* Online CPUs for next expedited GP. */
/* Any CPU that has ever been online will */
/* have its bit set. */
+ struct kthread_worker *exp_kworker;
+ /* Workers performing per node expedited GP */
+ /* initialization. */
unsigned long cbovldmask;
/* CPUs experiencing callback overload. */
unsigned long ffmask; /* Fully functional CPUs. */
@@ -113,7 +112,7 @@ struct rcu_node {
/* side effect, not as a lock. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
- struct mutex boost_kthread_mutex;
+ struct mutex kthread_mutex;
/* Exclusion for thread spawning and affinity */
/* manipulation. */
struct task_struct *boost_kthread_task;
@@ -467,11 +466,10 @@ static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
unsigned long j, bool lazy);
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags,
- bool lazy);
-static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
- unsigned long flags);
+static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
+ rcu_callback_t func, unsigned long flags, bool lazy);
+static void __maybe_unused __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 2ac440bc7e10..6b83537480b1 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -198,10 +198,9 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
}
if (rnp->parent == NULL) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- if (wake) {
- smp_mb(); /* EGP done before wake_up(). */
+ if (wake)
swake_up_one_online(&rcu_state.expedited_wq);
- }
+
break;
}
mask = rnp->grpmask;
@@ -419,7 +418,6 @@ retry_ipi:
static void rcu_exp_sel_wait_wake(unsigned long s);
-#ifdef CONFIG_RCU_EXP_KTHREAD
static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
{
struct rcu_exp_work *rewp =
@@ -428,9 +426,14 @@ static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
__sync_rcu_exp_select_node_cpus(rewp);
}
-static inline bool rcu_gp_par_worker_started(void)
+static inline bool rcu_exp_worker_started(void)
+{
+ return !!READ_ONCE(rcu_exp_gp_kworker);
+}
+
+static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp)
{
- return !!READ_ONCE(rcu_exp_par_gp_kworker);
+ return !!READ_ONCE(rnp->exp_kworker);
}
static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
@@ -441,7 +444,7 @@ static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
* another work item on the same kthread worker can result in
* deadlock.
*/
- kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work);
+ kthread_queue_work(READ_ONCE(rnp->exp_kworker), &rnp->rew.rew_work);
}
static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
@@ -466,64 +469,6 @@ static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew
kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work);
}
-static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
-{
-}
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
-{
- struct rcu_exp_work *rewp =
- container_of(wp, struct rcu_exp_work, rew_work);
-
- __sync_rcu_exp_select_node_cpus(rewp);
-}
-
-static inline bool rcu_gp_par_worker_started(void)
-{
- return !!READ_ONCE(rcu_par_gp_wq);
-}
-
-static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
-{
- int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
-
- INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
- /* If all offline, queue the work on an unbound CPU. */
- if (unlikely(cpu > rnp->grphi - rnp->grplo))
- cpu = WORK_CPU_UNBOUND;
- else
- cpu += rnp->grplo;
- queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
-}
-
-static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
-{
- flush_work(&rnp->rew.rew_work);
-}
-
-/*
- * Work-queue handler to drive an expedited grace period forward.
- */
-static void wait_rcu_exp_gp(struct work_struct *wp)
-{
- struct rcu_exp_work *rewp;
-
- rewp = container_of(wp, struct rcu_exp_work, rew_work);
- rcu_exp_sel_wait_wake(rewp->rew_s);
-}
-
-static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
-{
- INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp);
- queue_work(rcu_gp_wq, &rew->rew_work);
-}
-
-static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
-{
- destroy_work_on_stack(&rew->rew_work);
-}
-#endif /* CONFIG_RCU_EXP_KTHREAD */
-
/*
* Select the nodes that the upcoming expedited grace period needs
* to wait for.
@@ -541,7 +486,7 @@ static void sync_rcu_exp_select_cpus(void)
rnp->exp_need_flush = false;
if (!READ_ONCE(rnp->expmask))
continue; /* Avoid early boot non-existent wq. */
- if (!rcu_gp_par_worker_started() ||
+ if (!rcu_exp_par_worker_started(rnp) ||
rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
rcu_is_last_leaf_node(rnp)) {
/* No worker started yet or last leaf, do direct call. */
@@ -956,7 +901,6 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
*/
void synchronize_rcu_expedited(void)
{
- bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT);
unsigned long flags;
struct rcu_exp_work rew;
struct rcu_node *rnp;
@@ -996,7 +940,7 @@ void synchronize_rcu_expedited(void)
return; /* Someone else did our work for us. */
/* Ensure that load happens before action based on it. */
- if (unlikely(boottime)) {
+ if (unlikely((rcu_scheduler_active == RCU_SCHEDULER_INIT) || !rcu_exp_worker_started())) {
/* Direct call during scheduler init and early_initcalls(). */
rcu_exp_sel_wait_wake(s);
} else {
@@ -1013,9 +957,6 @@ void synchronize_rcu_expedited(void)
/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
-
- if (likely(!boottime))
- synchronize_rcu_expedited_destroy_work(&rew);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 4efbf7333d4e..3f85577bddd4 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -256,6 +256,7 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
return __wake_nocb_gp(rdp_gp, rdp, force, flags);
}
+#ifdef CONFIG_RCU_LAZY
/*
* LAZY_FLUSH_JIFFIES decides the maximum amount of time that
* can elapse before lazy callbacks are flushed. Lazy callbacks
@@ -264,21 +265,20 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
* left unsubmitted to RCU after those many jiffies.
*/
#define LAZY_FLUSH_JIFFIES (10 * HZ)
-static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
+static unsigned long jiffies_lazy_flush = LAZY_FLUSH_JIFFIES;
-#ifdef CONFIG_RCU_LAZY
// To be called only from test code.
-void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
+void rcu_set_jiffies_lazy_flush(unsigned long jif)
{
- jiffies_till_flush = jif;
+ jiffies_lazy_flush = jif;
}
-EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
+EXPORT_SYMBOL(rcu_set_jiffies_lazy_flush);
-unsigned long rcu_lazy_get_jiffies_till_flush(void)
+unsigned long rcu_get_jiffies_lazy_flush(void)
{
- return jiffies_till_flush;
+ return jiffies_lazy_flush;
}
-EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
+EXPORT_SYMBOL(rcu_get_jiffies_lazy_flush);
#endif
/*
@@ -299,7 +299,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
*/
if (waketype == RCU_NOCB_WAKE_LAZY &&
rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
- mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
+ mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush());
WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
} else if (waketype == RCU_NOCB_WAKE_BYPASS) {
mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
@@ -482,7 +482,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
// flush ->nocb_bypass to ->cblist.
if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
(ncbs && bypass_is_lazy &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) ||
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()))) ||
ncbs >= qhimark) {
rcu_nocb_lock(rdp);
*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
@@ -532,9 +532,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
// 2. Both of these conditions are met:
// a. The bypass list previously had only lazy CBs, and:
// b. The new CB is non-lazy.
- if (ncbs && (!bypass_is_lazy || lazy)) {
- local_irq_restore(flags);
- } else {
+ if (!ncbs || (bypass_is_lazy && !lazy)) {
// No-CBs GP kthread might be indefinitely asleep, if so, wake.
rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
@@ -544,7 +542,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
} else {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("FirstBQnoWake"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
}
}
return true; // Callback already enqueued.
@@ -566,11 +564,12 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
long lazy_len;
long len;
struct task_struct *t;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
// If we are being polled or there is no kthread, just leave.
t = READ_ONCE(rdp->nocb_gp_kthread);
if (rcu_nocb_poll || !t) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeNotPoll"));
return;
@@ -583,17 +582,17 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
rdp->qlen_last_fqs_check = len;
// Only lazy CBs in bypass list
if (lazy_len && bypass_len == lazy_len) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
TPS("WakeLazy"));
} else if (!irqs_disabled_flags(flags)) {
/* ... if queue was empty ... */
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
wake_nocb_gp(rdp, false);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
TPS("WakeEmptyIsDeferred"));
}
@@ -610,20 +609,32 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
smp_mb(); /* Enqueue before timer_pending(). */
if ((rdp->nocb_cb_sleep ||
!rcu_segcblist_ready_cbs(&rdp->cblist)) &&
- !timer_pending(&rdp->nocb_timer)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ !timer_pending(&rdp_gp->nocb_timer)) {
+ rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
TPS("WakeOvfIsDeferred"));
} else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
}
} else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
}
}
+static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
+ rcu_callback_t func, unsigned long flags, bool lazy)
+{
+ bool was_alldone;
+
+ if (!rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) {
+ /* Not enqueued on bypass but locked, do regular enqueue */
+ rcutree_enqueue(rdp, head, func);
+ __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
+ }
+}
+
static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
bool *wake_state)
{
@@ -723,7 +734,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
lazy_ncbs = READ_ONCE(rdp->lazy_len);
if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) ||
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()) ||
bypass_ncbs > 2 * qhimark)) {
flush_bypass = true;
} else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
@@ -779,7 +790,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
needwake = rdp->nocb_cb_sleep;
WRITE_ONCE(rdp->nocb_cb_sleep, false);
- smp_mb(); /* CB invocation -after- GP end. */
} else {
needwake = false;
}
@@ -933,8 +943,7 @@ static void nocb_cb_wait(struct rcu_data *rdp)
swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
nocb_cb_wait_cond(rdp));
- // VVV Ensure CB invocation follows _sleep test.
- if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+ if (READ_ONCE(rdp->nocb_cb_sleep)) {
WARN_ON(signal_pending(current));
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
}
@@ -1383,7 +1392,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
rcu_nocb_unlock_irqrestore(rdp, flags);
continue;
}
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
+ rcu_nocb_try_flush_bypass(rdp, jiffies);
rcu_nocb_unlock_irqrestore(rdp, flags);
wake_nocb_gp(rdp, false);
sc->nr_to_scan -= _count;
@@ -1768,10 +1777,10 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
return true;
}
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags, bool lazy)
+static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
+ rcu_callback_t func, unsigned long flags, bool lazy)
{
- return false;
+ WARN_ON_ONCE(1); /* Should be dead code! */
}
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 41021080ad25..36a8b5dbf5b5 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1195,14 +1195,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
struct sched_param sp;
struct task_struct *t;
- mutex_lock(&rnp->boost_kthread_mutex);
- if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
- goto out;
+ if (rnp->boost_kthread_task)
+ return;
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
if (WARN_ON_ONCE(IS_ERR(t)))
- goto out;
+ return;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
@@ -1210,48 +1209,11 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
-
- out:
- mutex_unlock(&rnp->boost_kthread_mutex);
}
-/*
- * Set the per-rcu_node kthread's affinity to cover all CPUs that are
- * served by the rcu_node in question. The CPU hotplug lock is still
- * held, so the value of rnp->qsmaskinit will be stable.
- *
- * We don't include outgoingcpu in the affinity set, use -1 if there is
- * no outgoing CPU. If there are no CPUs left in the affinity set,
- * this function allows the kthread to execute on any CPU.
- *
- * Any future concurrent calls are serialized via ->boost_kthread_mutex.
- */
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static struct task_struct *rcu_boost_task(struct rcu_node *rnp)
{
- struct task_struct *t = rnp->boost_kthread_task;
- unsigned long mask;
- cpumask_var_t cm;
- int cpu;
-
- if (!t)
- return;
- if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
- return;
- mutex_lock(&rnp->boost_kthread_mutex);
- mask = rcu_rnp_online_cpus(rnp);
- for_each_leaf_node_possible_cpu(rnp, cpu)
- if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
- cpu != outgoingcpu)
- cpumask_set_cpu(cpu, cm);
- cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
- if (cpumask_empty(cm)) {
- cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
- if (outgoingcpu >= 0)
- cpumask_clear_cpu(outgoingcpu, cm);
- }
- set_cpus_allowed_ptr(t, cm);
- mutex_unlock(&rnp->boost_kthread_mutex);
- free_cpumask_var(cm);
+ return READ_ONCE(rnp->boost_kthread_task);
}
#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1270,10 +1232,10 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
}
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static struct task_struct *rcu_boost_task(struct rcu_node *rnp)
{
+ return NULL;
}
-
#endif /* #else #ifdef CONFIG_RCU_BOOST */
/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9116bcc90346..540f229700b6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3955,6 +3955,17 @@ void wake_up_if_idle(int cpu)
}
}
+bool cpus_equal_capacity(int this_cpu, int that_cpu)
+{
+ if (!sched_asym_cpucap_active())
+ return true;
+
+ if (this_cpu == that_cpu)
+ return true;
+
+ return arch_scale_cpu_capacity(this_cpu) == arch_scale_cpu_capacity(that_cpu);
+}
+
bool cpus_share_cache(int this_cpu, int that_cpu)
{
if (this_cpu == that_cpu)
@@ -6787,10 +6798,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
static void sched_update_worker(struct task_struct *tsk)
{
- if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) {
+ if (tsk->flags & PF_BLOCK_TS)
+ blk_plug_invalidate_ts(tsk);
if (tsk->flags & PF_WQ_WORKER)
wq_worker_running(tsk);
- else
+ else if (tsk->flags & PF_IO_WORKER)
io_wq_worker_running(tsk);
}
}
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 2ad881d07752..4e715b9b278e 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -162,6 +162,9 @@
| MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \
| MEMBARRIER_CMD_GET_REGISTRATIONS)
+static DEFINE_MUTEX(membarrier_ipi_mutex);
+#define SERIALIZE_IPI() guard(mutex)(&membarrier_ipi_mutex)
+
static void ipi_mb(void *info)
{
smp_mb(); /* IPIs should be serializing but paranoid. */
@@ -259,6 +262,7 @@ static int membarrier_global_expedited(void)
if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
return -ENOMEM;
+ SERIALIZE_IPI();
cpus_read_lock();
rcu_read_lock();
for_each_online_cpu(cpu) {
@@ -347,6 +351,7 @@ static int membarrier_private_expedited(int flags, int cpu_id)
if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
return -ENOMEM;
+ SERIALIZE_IPI();
cpus_read_lock();
if (cpu_id >= 0) {
@@ -460,6 +465,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm)
* between threads which are users of @mm has its membarrier state
* updated.
*/
+ SERIALIZE_IPI();
cpus_read_lock();
rcu_read_lock();
for_each_online_cpu(cpu) {
diff --git a/kernel/signal.c b/kernel/signal.c
index c9c57d053ce4..bdca529f0f7b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -47,6 +47,7 @@
#include <linux/cgroup.h>
#include <linux/audit.h>
#include <linux/sysctl.h>
+#include <uapi/linux/pidfd.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -1436,7 +1437,8 @@ void lockdep_assert_task_sighand_held(struct task_struct *task)
#endif
/*
- * send signal info to all the members of a group
+ * send signal info to all the members of a thread group or to the
+ * individual thread if type == PIDTYPE_PID.
*/
int group_send_sig_info(int sig, struct kernel_siginfo *info,
struct task_struct *p, enum pid_type type)
@@ -1478,7 +1480,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
return ret;
}
-int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
+static int kill_pid_info_type(int sig, struct kernel_siginfo *info,
+ struct pid *pid, enum pid_type type)
{
int error = -ESRCH;
struct task_struct *p;
@@ -1487,11 +1490,10 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
rcu_read_lock();
p = pid_task(pid, PIDTYPE_PID);
if (p)
- error = group_send_sig_info(sig, info, p, PIDTYPE_TGID);
+ error = group_send_sig_info(sig, info, p, type);
rcu_read_unlock();
if (likely(!p || error != -ESRCH))
return error;
-
/*
* The task was unhashed in between, try again. If it
* is dead, pid_task() will return NULL, if we race with
@@ -1500,6 +1502,11 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
}
}
+int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
+{
+ return kill_pid_info_type(sig, info, pid, PIDTYPE_TGID);
+}
+
static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
{
int error;
@@ -1898,16 +1905,19 @@ int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
return send_sig_info(info.si_signo, &info, t);
}
-int kill_pgrp(struct pid *pid, int sig, int priv)
+static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
{
int ret;
-
read_lock(&tasklist_lock);
- ret = __kill_pgrp_info(sig, __si_special(priv), pid);
+ ret = __kill_pgrp_info(sig, info, pgrp);
read_unlock(&tasklist_lock);
-
return ret;
}
+
+int kill_pgrp(struct pid *pid, int sig, int priv)
+{
+ return kill_pgrp_info(sig, __si_special(priv), pid);
+}
EXPORT_SYMBOL(kill_pgrp);
int kill_pid(struct pid *pid, int sig, int priv)
@@ -2019,13 +2029,14 @@ ret:
return ret;
}
-static void do_notify_pidfd(struct task_struct *task)
+void do_notify_pidfd(struct task_struct *task)
{
- struct pid *pid;
+ struct pid *pid = task_pid(task);
WARN_ON(task->exit_state == 0);
- pid = task_pid(task);
- wake_up_all(&pid->wait_pidfd);
+
+ __wake_up(&pid->wait_pidfd, TASK_NORMAL, 0,
+ poll_to_key(EPOLLIN | EPOLLRDNORM));
}
/*
@@ -2050,9 +2061,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
WARN_ON_ONCE(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
-
- /* Wake up all pidfd waiters */
- do_notify_pidfd(tsk);
+ /*
+ * tsk is a group leader and has no threads, wake up the
+ * non-PIDFD_THREAD waiters.
+ */
+ if (thread_group_empty(tsk))
+ do_notify_pidfd(tsk);
if (sig != SIGCHLD) {
/*
@@ -3789,12 +3803,13 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese,
#endif
#endif
-static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
+static void prepare_kill_siginfo(int sig, struct kernel_siginfo *info,
+ enum pid_type type)
{
clear_siginfo(info);
info->si_signo = sig;
info->si_errno = 0;
- info->si_code = SI_USER;
+ info->si_code = (type == PIDTYPE_PID) ? SI_TKILL : SI_USER;
info->si_pid = task_tgid_vnr(current);
info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
}
@@ -3808,7 +3823,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
struct kernel_siginfo info;
- prepare_kill_siginfo(sig, &info);
+ prepare_kill_siginfo(sig, &info, PIDTYPE_TGID);
return kill_something_info(sig, &info, pid);
}
@@ -3861,6 +3876,10 @@ static struct pid *pidfd_to_pid(const struct file *file)
return tgid_pidfd_to_pid(file);
}
+#define PIDFD_SEND_SIGNAL_FLAGS \
+ (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \
+ PIDFD_SIGNAL_PROCESS_GROUP)
+
/**
* sys_pidfd_send_signal - Signal a process through a pidfd
* @pidfd: file descriptor of the process
@@ -3868,14 +3887,10 @@ static struct pid *pidfd_to_pid(const struct file *file)
* @info: signal info
* @flags: future flags
*
- * The syscall currently only signals via PIDTYPE_PID which covers
- * kill(<positive-pid>, <signal>. It does not signal threads or process
- * groups.
- * In order to extend the syscall to threads and process groups the @flags
- * argument should be used. In essence, the @flags argument will determine
- * what is signaled and not the file descriptor itself. Put in other words,
- * grouping is a property of the flags argument not a property of the file
- * descriptor.
+ * Send the signal to the thread group or to the individual thread depending
+ * on PIDFD_THREAD.
+ * In the future extension to @flags may be used to override the default scope
+ * of @pidfd.
*
* Return: 0 on success, negative errno on failure
*/
@@ -3886,9 +3901,14 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
struct fd f;
struct pid *pid;
kernel_siginfo_t kinfo;
+ enum pid_type type;
/* Enforce flags be set to 0 until we add an extension. */
- if (flags)
+ if (flags & ~PIDFD_SEND_SIGNAL_FLAGS)
+ return -EINVAL;
+
+ /* Ensure that only a single signal scope determining flag is set. */
+ if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1)
return -EINVAL;
f = fdget(pidfd);
@@ -3906,6 +3926,25 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
if (!access_pidfd_pidns(pid))
goto err;
+ switch (flags) {
+ case 0:
+ /* Infer scope from the type of pidfd. */
+ if (f.file->f_flags & PIDFD_THREAD)
+ type = PIDTYPE_PID;
+ else
+ type = PIDTYPE_TGID;
+ break;
+ case PIDFD_SIGNAL_THREAD:
+ type = PIDTYPE_PID;
+ break;
+ case PIDFD_SIGNAL_THREAD_GROUP:
+ type = PIDTYPE_TGID;
+ break;
+ case PIDFD_SIGNAL_PROCESS_GROUP:
+ type = PIDTYPE_PGID;
+ break;
+ }
+
if (info) {
ret = copy_siginfo_from_user_any(&kinfo, info);
if (unlikely(ret))
@@ -3917,15 +3956,17 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
/* Only allow sending arbitrary signals to yourself. */
ret = -EPERM;
- if ((task_pid(current) != pid) &&
+ if ((task_pid(current) != pid || type > PIDTYPE_TGID) &&
(kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
goto err;
} else {
- prepare_kill_siginfo(sig, &kinfo);
+ prepare_kill_siginfo(sig, &kinfo, type);
}
- ret = kill_pid_info(sig, &kinfo, pid);
-
+ if (type == PIDTYPE_PGID)
+ ret = kill_pgrp_info(sig, &kinfo, pid);
+ else
+ ret = kill_pid_info_type(sig, &kinfo, pid, type);
err:
fdput(f);
return ret;
@@ -3965,12 +4006,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
struct kernel_siginfo info;
- clear_siginfo(&info);
- info.si_signo = sig;
- info.si_errno = 0;
- info.si_code = SI_TKILL;
- info.si_pid = task_tgid_vnr(current);
- info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
+ prepare_kill_siginfo(sig, &info, PIDTYPE_PID);
return do_send_specific(tgid, pid, sig, &info);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 210cf5f8d92c..b315b21fb28c 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -27,6 +27,7 @@
#include <linux/tick.h>
#include <linux/irq.h>
#include <linux/wait_bit.h>
+#include <linux/workqueue.h>
#include <asm/softirq_stack.h>
@@ -802,11 +803,13 @@ static void tasklet_action_common(struct softirq_action *a,
static __latent_entropy void tasklet_action(struct softirq_action *a)
{
+ workqueue_softirq_action(false);
tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
}
static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
{
+ workqueue_softirq_action(true);
tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
}
@@ -929,6 +932,8 @@ static void run_ksoftirqd(unsigned int cpu)
#ifdef CONFIG_HOTPLUG_CPU
static int takeover_tasklets(unsigned int cpu)
{
+ workqueue_softirq_dead(cpu);
+
/* CPU is dead, so no lock needed. */
local_irq_disable();
diff --git a/kernel/sys.c b/kernel/sys.c
index e219fcfa112d..f8e543f1e38a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1785,21 +1785,24 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
struct task_struct *t;
unsigned long flags;
u64 tgutime, tgstime, utime, stime;
- unsigned long maxrss = 0;
+ unsigned long maxrss;
+ struct mm_struct *mm;
struct signal_struct *sig = p->signal;
+ unsigned int seq = 0;
- memset((char *)r, 0, sizeof (*r));
+retry:
+ memset(r, 0, sizeof(*r));
utime = stime = 0;
+ maxrss = 0;
if (who == RUSAGE_THREAD) {
task_cputime_adjusted(current, &utime, &stime);
accumulate_thread_rusage(p, r);
maxrss = sig->maxrss;
- goto out;
+ goto out_thread;
}
- if (!lock_task_sighand(p, &flags))
- return;
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
switch (who) {
case RUSAGE_BOTH:
@@ -1819,9 +1822,6 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
fallthrough;
case RUSAGE_SELF:
- thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- utime += tgutime;
- stime += tgstime;
r->ru_nvcsw += sig->nvcsw;
r->ru_nivcsw += sig->nivcsw;
r->ru_minflt += sig->min_flt;
@@ -1830,28 +1830,42 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
r->ru_oublock += sig->oublock;
if (maxrss < sig->maxrss)
maxrss = sig->maxrss;
+
+ rcu_read_lock();
__for_each_thread(sig, t)
accumulate_thread_rusage(t, r);
+ rcu_read_unlock();
+
break;
default:
BUG();
}
- unlock_task_sighand(p, &flags);
-out:
- r->ru_utime = ns_to_kernel_old_timeval(utime);
- r->ru_stime = ns_to_kernel_old_timeval(stime);
+ if (need_seqretry(&sig->stats_lock, seq)) {
+ seq = 1;
+ goto retry;
+ }
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
- if (who != RUSAGE_CHILDREN) {
- struct mm_struct *mm = get_task_mm(p);
+ if (who == RUSAGE_CHILDREN)
+ goto out_children;
- if (mm) {
- setmax_mm_hiwater_rss(&maxrss, mm);
- mmput(mm);
- }
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
+ utime += tgutime;
+ stime += tgstime;
+
+out_thread:
+ mm = get_task_mm(p);
+ if (mm) {
+ setmax_mm_hiwater_rss(&maxrss, mm);
+ mmput(mm);
}
+
+out_children:
r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
+ r->ru_utime = ns_to_kernel_old_timeval(utime);
+ r->ru_stime = ns_to_kernel_old_timeval(stime);
}
SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 760793998cdd..edb0f821dcea 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1085,6 +1085,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
enum hrtimer_mode mode)
{
debug_activate(timer, mode);
+ WARN_ON_ONCE(!base->cpu_base->online);
base->cpu_base->active_bases |= 1 << base->index;
@@ -2183,6 +2184,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
cpu_base->softirq_next_timer = NULL;
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
+ cpu_base->online = 1;
hrtimer_cpu_base_init_expiry_lock(cpu_base);
return 0;
}
@@ -2250,6 +2252,7 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
raw_spin_unlock(&new_base->lock);
+ old_base->online = 0;
raw_spin_unlock(&old_base->lock);
return 0;
diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c
index ca058c8af6ba..3e5d422dd15c 100644
--- a/kernel/time/time_test.c
+++ b/kernel/time/time_test.c
@@ -73,7 +73,7 @@ static void time64_to_tm_test_date_range(struct kunit *test)
days = div_s64(secs, 86400);
- #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %ld", \
+ #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %lld", \
year, month, mdday, yday, days
KUNIT_ASSERT_EQ_MSG(test, year - 1900, result.tm_year, FAIL_MSG);
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 6cd2a4e3afb8..9ff018245840 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -189,9 +189,6 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
{
int size;
- if (num <= 0)
- return -EINVAL;
-
if (!fp->exit_handler) {
fp->rethook = NULL;
return 0;
@@ -199,15 +196,16 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
/* Initialize rethook if needed */
if (fp->nr_maxactive)
- size = fp->nr_maxactive;
+ num = fp->nr_maxactive;
else
- size = num * num_possible_cpus() * 2;
- if (size <= 0)
+ num *= num_possible_cpus() * 2;
+ if (num <= 0)
return -EINVAL;
+ size = sizeof(struct fprobe_rethook_node) + fp->entry_data_size;
+
/* Initialize rethook */
- fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler,
- sizeof(struct fprobe_rethook_node), size);
+ fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, size, num);
if (IS_ERR(fp->rethook))
return PTR_ERR(fp->rethook);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b01ae7d36021..83ba342aef31 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5325,7 +5325,17 @@ static LIST_HEAD(ftrace_direct_funcs);
static int register_ftrace_function_nolock(struct ftrace_ops *ops);
+/*
+ * If there are multiple ftrace_ops, use SAVE_REGS by default, so that direct
+ * call will be jumped from ftrace_regs_caller. Only if the architecture does
+ * not support ftrace_regs_caller but direct_call, use SAVE_ARGS so that it
+ * jumps from ftrace_caller for multiple ftrace_ops.
+ */
+#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS
#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_ARGS)
+#else
+#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS)
+#endif
static int check_direct_multi(struct ftrace_ops *ops)
{
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index fd4bfe3ecf01..aa332ace108b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -384,7 +384,6 @@ struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
wait_queue_head_t full_waiters;
- long wait_index;
bool waiters_pending;
bool full_waiters_pending;
bool wakeup_full;
@@ -756,8 +755,19 @@ static void rb_wake_up_waiters(struct irq_work *work)
wake_up_all(&rbwork->waiters);
if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
+ /* Only cpu_buffer sets the above flags */
+ struct ring_buffer_per_cpu *cpu_buffer =
+ container_of(rbwork, struct ring_buffer_per_cpu, irq_work);
+
+ /* Called from interrupt context */
+ raw_spin_lock(&cpu_buffer->reader_lock);
rbwork->wakeup_full = false;
rbwork->full_waiters_pending = false;
+
+ /* Waking up all waiters, they will reset the shortest full */
+ cpu_buffer->shortest_full = 0;
+ raw_spin_unlock(&cpu_buffer->reader_lock);
+
wake_up_all(&rbwork->full_waiters);
}
}
@@ -798,14 +808,40 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
rbwork = &cpu_buffer->irq_work;
}
- rbwork->wait_index++;
- /* make sure the waiters see the new index */
- smp_wmb();
-
/* This can be called in any context */
irq_work_queue(&rbwork->work);
}
+static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ bool ret = false;
+
+ /* Reads of all CPUs always waits for any data */
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ return !ring_buffer_empty(buffer);
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (!ring_buffer_empty_cpu(buffer, cpu)) {
+ unsigned long flags;
+ bool pagebusy;
+
+ if (!full)
+ return true;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+ ret = !pagebusy && full_hit(buffer, cpu, full);
+
+ if (!cpu_buffer->shortest_full ||
+ cpu_buffer->shortest_full > full)
+ cpu_buffer->shortest_full = full;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ }
+ return ret;
+}
+
/**
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
@@ -821,7 +857,6 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
struct rb_irq_work *work;
- long wait_index;
int ret = 0;
/*
@@ -840,81 +875,54 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
work = &cpu_buffer->irq_work;
}
- wait_index = READ_ONCE(work->wait_index);
-
- while (true) {
- if (full)
- prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
- else
- prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
-
- /*
- * The events can happen in critical sections where
- * checking a work queue can cause deadlocks.
- * After adding a task to the queue, this flag is set
- * only to notify events to try to wake up the queue
- * using irq_work.
- *
- * We don't clear it even if the buffer is no longer
- * empty. The flag only causes the next event to run
- * irq_work to do the work queue wake up. The worse
- * that can happen if we race with !trace_empty() is that
- * an event will cause an irq_work to try to wake up
- * an empty queue.
- *
- * There's no reason to protect this flag either, as
- * the work queue and irq_work logic will do the necessary
- * synchronization for the wake ups. The only thing
- * that is necessary is that the wake up happens after
- * a task has been queued. It's OK for spurious wake ups.
- */
- if (full)
- work->full_waiters_pending = true;
- else
- work->waiters_pending = true;
-
- if (signal_pending(current)) {
- ret = -EINTR;
- break;
- }
-
- if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
- break;
-
- if (cpu != RING_BUFFER_ALL_CPUS &&
- !ring_buffer_empty_cpu(buffer, cpu)) {
- unsigned long flags;
- bool pagebusy;
- bool done;
-
- if (!full)
- break;
-
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
- done = !pagebusy && full_hit(buffer, cpu, full);
+ if (full)
+ prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
+ else
+ prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
- if (!cpu_buffer->shortest_full ||
- cpu_buffer->shortest_full > full)
- cpu_buffer->shortest_full = full;
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- if (done)
- break;
- }
+ /*
+ * The events can happen in critical sections where
+ * checking a work queue can cause deadlocks.
+ * After adding a task to the queue, this flag is set
+ * only to notify events to try to wake up the queue
+ * using irq_work.
+ *
+ * We don't clear it even if the buffer is no longer
+ * empty. The flag only causes the next event to run
+ * irq_work to do the work queue wake up. The worse
+ * that can happen if we race with !trace_empty() is that
+ * an event will cause an irq_work to try to wake up
+ * an empty queue.
+ *
+ * There's no reason to protect this flag either, as
+ * the work queue and irq_work logic will do the necessary
+ * synchronization for the wake ups. The only thing
+ * that is necessary is that the wake up happens after
+ * a task has been queued. It's OK for spurious wake ups.
+ */
+ if (full)
+ work->full_waiters_pending = true;
+ else
+ work->waiters_pending = true;
- schedule();
+ if (rb_watermark_hit(buffer, cpu, full))
+ goto out;
- /* Make sure to see the new wait index */
- smp_rmb();
- if (wait_index != work->wait_index)
- break;
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
}
+ schedule();
+ out:
if (full)
finish_wait(&work->full_waiters, &wait);
else
finish_wait(&work->waiters, &wait);
+ if (!ret && !rb_watermark_hit(buffer, cpu, full) && signal_pending(current))
+ ret = -EINTR;
+
return ret;
}
@@ -937,28 +945,33 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
struct file *filp, poll_table *poll_table, int full)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct rb_irq_work *work;
+ struct rb_irq_work *rbwork;
if (cpu == RING_BUFFER_ALL_CPUS) {
- work = &buffer->irq_work;
+ rbwork = &buffer->irq_work;
full = 0;
} else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return EPOLLERR;
cpu_buffer = buffer->buffers[cpu];
- work = &cpu_buffer->irq_work;
+ rbwork = &cpu_buffer->irq_work;
}
if (full) {
- poll_wait(filp, &work->full_waiters, poll_table);
- work->full_waiters_pending = true;
+ unsigned long flags;
+
+ poll_wait(filp, &rbwork->full_waiters, poll_table);
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ rbwork->full_waiters_pending = true;
if (!cpu_buffer->shortest_full ||
cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
} else {
- poll_wait(filp, &work->waiters, poll_table);
- work->waiters_pending = true;
+ poll_wait(filp, &rbwork->waiters, poll_table);
+ rbwork->waiters_pending = true;
}
/*
@@ -5877,6 +5890,10 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
if (psize <= BUF_PAGE_HDR_SIZE)
return -EINVAL;
+ /* Size of a subbuf cannot be greater than the write counter */
+ if (psize > RB_WRITE_MASK + 1)
+ return -EINVAL;
+
old_order = buffer->subbuf_order;
old_size = buffer->subbuf_size;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2a7c6fd934e9..c9c898307348 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -39,6 +39,7 @@
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/panic_notifier.h>
+#include <linux/kmemleak.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
@@ -1532,7 +1533,7 @@ void disable_trace_on_warning(void)
bool tracer_tracing_is_on(struct trace_array *tr)
{
if (tr->array_buffer.buffer)
- return ring_buffer_record_is_on(tr->array_buffer.buffer);
+ return ring_buffer_record_is_set_on(tr->array_buffer.buffer);
return !tr->buffer_disabled;
}
@@ -2320,7 +2321,7 @@ struct saved_cmdlines_buffer {
unsigned *map_cmdline_to_pid;
unsigned cmdline_num;
int cmdline_idx;
- char *saved_cmdlines;
+ char saved_cmdlines[];
};
static struct saved_cmdlines_buffer *savedcmd;
@@ -2334,47 +2335,60 @@ static inline void set_cmdline(int idx, const char *cmdline)
strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
}
-static int allocate_cmdlines_buffer(unsigned int val,
- struct saved_cmdlines_buffer *s)
+static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
+{
+ int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN);
+
+ kfree(s->map_cmdline_to_pid);
+ kmemleak_free(s);
+ free_pages((unsigned long)s, order);
+}
+
+static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
{
+ struct saved_cmdlines_buffer *s;
+ struct page *page;
+ int orig_size, size;
+ int order;
+
+ /* Figure out how much is needed to hold the given number of cmdlines */
+ orig_size = sizeof(*s) + val * TASK_COMM_LEN;
+ order = get_order(orig_size);
+ size = 1 << (order + PAGE_SHIFT);
+ page = alloc_pages(GFP_KERNEL, order);
+ if (!page)
+ return NULL;
+
+ s = page_address(page);
+ kmemleak_alloc(s, size, 1, GFP_KERNEL);
+ memset(s, 0, sizeof(*s));
+
+ /* Round up to actual allocation */
+ val = (size - sizeof(*s)) / TASK_COMM_LEN;
+ s->cmdline_num = val;
+
s->map_cmdline_to_pid = kmalloc_array(val,
sizeof(*s->map_cmdline_to_pid),
GFP_KERNEL);
- if (!s->map_cmdline_to_pid)
- return -ENOMEM;
-
- s->saved_cmdlines = kmalloc_array(TASK_COMM_LEN, val, GFP_KERNEL);
- if (!s->saved_cmdlines) {
- kfree(s->map_cmdline_to_pid);
- return -ENOMEM;
+ if (!s->map_cmdline_to_pid) {
+ free_saved_cmdlines_buffer(s);
+ return NULL;
}
s->cmdline_idx = 0;
- s->cmdline_num = val;
memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
sizeof(s->map_pid_to_cmdline));
memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
val * sizeof(*s->map_cmdline_to_pid));
- return 0;
+ return s;
}
static int trace_create_savedcmd(void)
{
- int ret;
-
- savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL);
- if (!savedcmd)
- return -ENOMEM;
+ savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);
- ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd);
- if (ret < 0) {
- kfree(savedcmd);
- savedcmd = NULL;
- return -ENOMEM;
- }
-
- return 0;
+ return savedcmd ? 0 : -ENOMEM;
}
int is_tracing_stopped(void)
@@ -6056,26 +6070,14 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
-static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
-{
- kfree(s->saved_cmdlines);
- kfree(s->map_cmdline_to_pid);
- kfree(s);
-}
-
static int tracing_resize_saved_cmdlines(unsigned int val)
{
struct saved_cmdlines_buffer *s, *savedcmd_temp;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = allocate_cmdlines_buffer(val);
if (!s)
return -ENOMEM;
- if (allocate_cmdlines_buffer(val, s) < 0) {
- kfree(s);
- return -ENOMEM;
- }
-
preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
savedcmd_temp = savedcmd;
@@ -7291,6 +7293,8 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
return 0;
}
+#define TRACE_MARKER_MAX_SIZE 4096
+
static ssize_t
tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
@@ -7318,6 +7322,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if ((ssize_t)cnt < 0)
return -EINVAL;
+ if (cnt > TRACE_MARKER_MAX_SIZE)
+ cnt = TRACE_MARKER_MAX_SIZE;
+
meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */
again:
size = cnt + meta_size;
@@ -7326,11 +7333,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (cnt < FAULTED_SIZE)
size += FAULTED_SIZE - cnt;
- if (size > TRACE_SEQ_BUFFER_SIZE) {
- cnt -= size - TRACE_SEQ_BUFFER_SIZE;
- goto again;
- }
-
buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
tracing_gen_ctx());
@@ -8391,6 +8393,20 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
return size;
}
+static int tracing_buffers_flush(struct file *file, fl_owner_t id)
+{
+ struct ftrace_buffer_info *info = file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ iter->wait_index++;
+ /* Make sure the waiters see the new wait_index */
+ smp_wmb();
+
+ ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+
+ return 0;
+}
+
static int tracing_buffers_release(struct inode *inode, struct file *file)
{
struct ftrace_buffer_info *info = file->private_data;
@@ -8402,12 +8418,6 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
__trace_array_put(iter->tr);
- iter->wait_index++;
- /* Make sure the waiters see the new wait_index */
- smp_wmb();
-
- ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
-
if (info->spare)
ring_buffer_free_read_page(iter->array_buffer->buffer,
info->spare_cpu, info->spare);
@@ -8623,6 +8633,7 @@ static const struct file_operations tracing_buffers_fops = {
.read = tracing_buffers_read,
.poll = tracing_buffers_poll,
.release = tracing_buffers_release,
+ .flush = tracing_buffers_flush,
.splice_read = tracing_buffers_splice_read,
.unlocked_ioctl = tracing_buffers_ioctl,
.llseek = no_llseek,
diff --git a/kernel/trace/trace_btf.c b/kernel/trace/trace_btf.c
index ca224d53bfdc..5bbdbcbbde3c 100644
--- a/kernel/trace/trace_btf.c
+++ b/kernel/trace/trace_btf.c
@@ -91,8 +91,8 @@ retry:
for_each_member(i, type, member) {
if (!member->name_off) {
/* Anonymous union/struct: push it for later use */
- type = btf_type_skip_modifiers(btf, member->type, &tid);
- if (type && top < BTF_ANON_STACK_MAX) {
+ if (btf_type_skip_modifiers(btf, member->type, &tid) &&
+ top < BTF_ANON_STACK_MAX) {
anon_stack[top].tid = tid;
anon_stack[top++].offset =
cur_offset + member->offset;
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index e7af286af4f1..c82b401a294d 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -441,8 +441,9 @@ static unsigned int trace_string(struct synth_trace_event *entry,
if (is_dynamic) {
union trace_synth_field *data = &entry->fields[*n_u64];
+ len = fetch_store_strlen((unsigned long)str_val);
data->as_dynamic.offset = struct_size(entry, fields, event->n_u64) + data_size;
- data->as_dynamic.len = fetch_store_strlen((unsigned long)str_val);
+ data->as_dynamic.len = len;
ret = fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 3e7fa44dc2b2..d8b302d01083 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1587,12 +1587,11 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
{
struct print_entry *field;
struct trace_seq *s = &iter->seq;
- int max = iter->ent_size - offsetof(struct print_entry, buf);
trace_assign_type(field, iter->ent);
seq_print_ip_sym(s, field->ip, flags);
- trace_seq_printf(s, ": %.*s", max, field->buf);
+ trace_seq_printf(s, ": %s", field->buf);
return trace_handle_return(s);
}
@@ -1601,11 +1600,10 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
struct print_entry *field;
- int max = iter->ent_size - offsetof(struct print_entry, buf);
trace_assign_type(field, iter->ent);
- trace_seq_printf(&iter->seq, "# %lx %.*s", field->ip, max, field->buf);
+ trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
return trace_handle_return(&iter->seq);
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 4dc74d73fc1d..34289f9c6707 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1159,9 +1159,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
if (!(ctx->flags & TPARG_FL_TEVENT) &&
(strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 ||
strncmp(arg, "\\\"", 2) == 0)) {
- /* The type of $comm must be "string", and not an array. */
- if (parg->count || (t && strcmp(t, "string")))
+ /* The type of $comm must be "string", and not an array type. */
+ if (parg->count || (t && strcmp(t, "string"))) {
+ trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
+ NEED_STRING_TYPE);
goto out;
+ }
parg->type = find_fetch_type("string", ctx->flags);
} else
parg->type = find_fetch_type(t, ctx->flags);
@@ -1169,18 +1172,6 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE);
goto out;
}
- parg->offset = *size;
- *size += parg->type->size * (parg->count ?: 1);
-
- ret = -ENOMEM;
- if (parg->count) {
- len = strlen(parg->type->fmttype) + 6;
- parg->fmt = kmalloc(len, GFP_KERNEL);
- if (!parg->fmt)
- goto out;
- snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
- parg->count);
- }
code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
if (!code)
@@ -1204,6 +1195,19 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
goto fail;
}
}
+ parg->offset = *size;
+ *size += parg->type->size * (parg->count ?: 1);
+
+ if (parg->count) {
+ len = strlen(parg->type->fmttype) + 6;
+ parg->fmt = kmalloc(len, GFP_KERNEL);
+ if (!parg->fmt) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
+ parg->count);
+ }
ret = -EINVAL;
/* Store operation */
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 850d9ecb6765..c1877d018269 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -515,7 +515,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(BAD_HYPHEN, "Failed to parse single hyphen. Forgot '>'?"), \
C(NO_BTF_FIELD, "This field is not found."), \
C(BAD_BTF_TID, "Failed to get BTF type info."),\
- C(BAD_TYPE4STR, "This type does not fit for string."),
+ C(BAD_TYPE4STR, "This type does not fit for string."),\
+ C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),
#undef C
#define C(a, b) TP_ERR_##a
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 76e60faed892..a60eb65955e7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -29,6 +29,7 @@
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
+#include <linux/interrupt.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
@@ -53,10 +54,11 @@
#include <linux/nmi.h>
#include <linux/kvm_para.h>
#include <linux/delay.h>
+#include <linux/irq_work.h>
#include "workqueue_internal.h"
-enum {
+enum worker_pool_flags {
/*
* worker_pool flags
*
@@ -72,10 +74,17 @@ enum {
* Note that DISASSOCIATED should be flipped only while holding
* wq_pool_attach_mutex to avoid changing binding state while
* worker_attach_to_pool() is in progress.
+ *
+ * As there can only be one concurrent BH execution context per CPU, a
+ * BH pool is per-CPU and always DISASSOCIATED.
*/
- POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */
+ POOL_BH = 1 << 0, /* is a BH pool */
+ POOL_MANAGER_ACTIVE = 1 << 1, /* being managed */
POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
+ POOL_BH_DRAINING = 1 << 3, /* draining after CPU offline */
+};
+enum worker_flags {
/* worker flags */
WORKER_DIE = 1 << 1, /* die die die */
WORKER_IDLE = 1 << 2, /* is idle */
@@ -86,7 +95,13 @@ enum {
WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE |
WORKER_UNBOUND | WORKER_REBOUND,
+};
+enum work_cancel_flags {
+ WORK_CANCEL_DELAYED = 1 << 0, /* canceling a delayed_work */
+};
+
+enum wq_internal_consts {
NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */
@@ -108,10 +123,18 @@ enum {
RESCUER_NICE_LEVEL = MIN_NICE,
HIGHPRI_NICE_LEVEL = MIN_NICE,
- WQ_NAME_LEN = 24,
+ WQ_NAME_LEN = 32,
};
/*
+ * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and
+ * MAX_SOFTIRQ_RESTART in kernel/softirq.c. These are macros because
+ * msecs_to_jiffies() can't be an initializer.
+ */
+#define BH_WORKER_JIFFIES msecs_to_jiffies(2)
+#define BH_WORKER_RESTARTS 10
+
+/*
* Structure fields follow one of the following exclusion rules.
*
* I: Modifiable by initialization/destruction paths and read-only for
@@ -122,6 +145,9 @@ enum {
*
* L: pool->lock protected. Access with pool->lock held.
*
+ * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for
+ * reads.
+ *
* K: Only modified by worker while holding pool->lock. Can be safely read by
* self, while holding pool->lock or from IRQ context if %current is the
* kworker.
@@ -143,6 +169,9 @@ enum {
*
* WR: wq->mutex protected for writes. RCU protected for reads.
*
+ * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read
+ * with READ_ONCE() without locking.
+ *
* MD: wq_mayday_lock protected.
*
* WD: Used internally by the watchdog.
@@ -219,7 +248,7 @@ enum pool_workqueue_stats {
};
/*
- * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
+ * The per-pool workqueue. While queued, bits below WORK_PWQ_SHIFT
* of work_struct->data are used for flags and the remaining high bits
* point to the pwq; thus, pwqs need to be aligned at two's power of the
* number of flag bits.
@@ -232,6 +261,7 @@ struct pool_workqueue {
int refcnt; /* L: reference count */
int nr_in_flight[WORK_NR_COLORS];
/* L: nr of in_flight works */
+ bool plugged; /* L: execution suspended */
/*
* nr_active management and WORK_STRUCT_INACTIVE:
@@ -240,18 +270,18 @@ struct pool_workqueue {
* pwq->inactive_works instead of pool->worklist and marked with
* WORK_STRUCT_INACTIVE.
*
- * All work items marked with WORK_STRUCT_INACTIVE do not participate
- * in pwq->nr_active and all work items in pwq->inactive_works are
- * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE
- * work items are in pwq->inactive_works. Some of them are ready to
- * run in pool->worklist or worker->scheduled. Those work itmes are
- * only struct wq_barrier which is used for flush_work() and should
- * not participate in pwq->nr_active. For non-barrier work item, it
- * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
+ * All work items marked with WORK_STRUCT_INACTIVE do not participate in
+ * nr_active and all work items in pwq->inactive_works are marked with
+ * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are
+ * in pwq->inactive_works. Some of them are ready to run in
+ * pool->worklist or worker->scheduled. Those work itmes are only struct
+ * wq_barrier which is used for flush_work() and should not participate
+ * in nr_active. For non-barrier work item, it is marked with
+ * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
*/
int nr_active; /* L: nr of active works */
- int max_active; /* L: max active works */
struct list_head inactive_works; /* L: inactive works */
+ struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */
struct list_head pwqs_node; /* WR: node on wq->pwqs */
struct list_head mayday_node; /* MD: node on wq->maydays */
@@ -265,7 +295,7 @@ struct pool_workqueue {
*/
struct kthread_work release_work;
struct rcu_head rcu;
-} __aligned(1 << WORK_STRUCT_FLAG_BITS);
+} __aligned(1 << WORK_STRUCT_PWQ_SHIFT);
/*
* Structure used to wait for workqueue flush.
@@ -279,6 +309,26 @@ struct wq_flusher {
struct wq_device;
/*
+ * Unlike in a per-cpu workqueue where max_active limits its concurrency level
+ * on each CPU, in an unbound workqueue, max_active applies to the whole system.
+ * As sharing a single nr_active across multiple sockets can be very expensive,
+ * the counting and enforcement is per NUMA node.
+ *
+ * The following struct is used to enforce per-node max_active. When a pwq wants
+ * to start executing a work item, it should increment ->nr using
+ * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over
+ * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish
+ * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in
+ * round-robin order.
+ */
+struct wq_node_nr_active {
+ int max; /* per-node max_active */
+ atomic_t nr; /* per-node nr_active */
+ raw_spinlock_t lock; /* nests inside pool locks */
+ struct list_head pending_pwqs; /* LN: pwqs with inactive works */
+};
+
+/*
* The externally visible workqueue. It relays the issued work items to
* the appropriate worker_pool through its pool_workqueues.
*/
@@ -298,10 +348,15 @@ struct workqueue_struct {
struct worker *rescuer; /* MD: rescue worker */
int nr_drainers; /* WQ: drain in progress */
- int saved_max_active; /* WQ: saved pwq max_active */
+
+ /* See alloc_workqueue() function comment for info on min/max_active */
+ int max_active; /* WO: max active works */
+ int min_active; /* WO: min active works */
+ int saved_max_active; /* WQ: saved max_active */
+ int saved_min_active; /* WQ: saved min_active */
struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */
- struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */
+ struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */
#ifdef CONFIG_SYSFS
struct wq_device *wq_dev; /* I: for sysfs interface */
@@ -323,10 +378,9 @@ struct workqueue_struct {
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
+ struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
};
-static struct kmem_cache *pwq_cache;
-
/*
* Each pod type describes how CPUs should be grouped for unbound workqueues.
* See the comment above workqueue_attrs->affn_scope.
@@ -338,16 +392,13 @@ struct wq_pod_type {
int *cpu_pod; /* cpu -> pod */
};
-static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
-static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;
-
static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
- [WQ_AFFN_DFL] = "default",
- [WQ_AFFN_CPU] = "cpu",
- [WQ_AFFN_SMT] = "smt",
- [WQ_AFFN_CACHE] = "cache",
- [WQ_AFFN_NUMA] = "numa",
- [WQ_AFFN_SYSTEM] = "system",
+ [WQ_AFFN_DFL] = "default",
+ [WQ_AFFN_CPU] = "cpu",
+ [WQ_AFFN_SMT] = "smt",
+ [WQ_AFFN_CACHE] = "cache",
+ [WQ_AFFN_NUMA] = "numa",
+ [WQ_AFFN_SYSTEM] = "system",
};
/*
@@ -359,12 +410,22 @@ static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
*/
static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
+#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
+static unsigned int wq_cpu_intensive_warning_thresh = 4;
+module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh, uint, 0644);
+#endif
/* see the comment above the definition of WQ_POWER_EFFICIENT */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);
static bool wq_online; /* can kworkers be created yet? */
+static bool wq_topo_initialized __read_mostly = false;
+
+static struct kmem_cache *pwq_cache;
+
+static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
+static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;
/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *wq_update_pod_attrs_buf;
@@ -405,8 +466,17 @@ static bool wq_debug_force_rr_cpu = false;
#endif
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
+/* to raise softirq for the BH worker pools on other CPUs */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS],
+ bh_pool_irq_works);
+
+/* the BH worker pools */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
+ bh_worker_pools);
+
/* the per-cpu worker pools */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
+ cpu_worker_pools);
static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */
@@ -420,6 +490,12 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
/*
+ * Used to synchronize multiple cancel_sync attempts on the same work item. See
+ * work_grab_pending() and __cancel_work_sync().
+ */
+static DECLARE_WAIT_QUEUE_HEAD(wq_cancel_waitq);
+
+/*
* I: kthread_worker to release pwq's. pwq release needs to be bounced to a
* process context while holding a pool lock. Bounce to a dedicated kthread
* worker to avoid A-A deadlocks.
@@ -440,6 +516,10 @@ struct workqueue_struct *system_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
+struct workqueue_struct *system_bh_wq;
+EXPORT_SYMBOL_GPL(system_bh_wq);
+struct workqueue_struct *system_bh_highpri_wq;
+EXPORT_SYMBOL_GPL(system_bh_highpri_wq);
static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
@@ -450,16 +530,21 @@ static void show_one_worker_pool(struct worker_pool *pool);
#include <trace/events/workqueue.h>
#define assert_rcu_or_pool_mutex() \
- RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() && \
!lockdep_is_held(&wq_pool_mutex), \
"RCU or wq_pool_mutex should be held")
#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
- RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() && \
!lockdep_is_held(&wq->mutex) && \
!lockdep_is_held(&wq_pool_mutex), \
"RCU, wq->mutex or wq_pool_mutex should be held")
+#define for_each_bh_worker_pool(pool, cpu) \
+ for ((pool) = &per_cpu(bh_worker_pools, cpu)[0]; \
+ (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
+ (pool)++)
+
#define for_each_cpu_worker_pool(pool, cpu) \
for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
(pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
@@ -632,6 +717,36 @@ static int worker_pool_assign_id(struct worker_pool *pool)
return ret;
}
+static struct pool_workqueue __rcu **
+unbound_pwq_slot(struct workqueue_struct *wq, int cpu)
+{
+ if (cpu >= 0)
+ return per_cpu_ptr(wq->cpu_pwq, cpu);
+ else
+ return &wq->dfl_pwq;
+}
+
+/* @cpu < 0 for dfl_pwq */
+static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu)
+{
+ return rcu_dereference_check(*unbound_pwq_slot(wq, cpu),
+ lockdep_is_held(&wq_pool_mutex) ||
+ lockdep_is_held(&wq->mutex));
+}
+
+/**
+ * unbound_effective_cpumask - effective cpumask of an unbound workqueue
+ * @wq: workqueue of interest
+ *
+ * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which
+ * is masked with wq_unbound_cpumask to determine the effective cpumask. The
+ * default pwq is always mapped to the pool with the current effective cpumask.
+ */
+static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq)
+{
+ return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask;
+}
+
static unsigned int work_color_to_flags(int color)
{
return color << WORK_STRUCT_COLOR_SHIFT;
@@ -653,10 +768,9 @@ static int work_next_color(int color)
* contain the pointer to the queued pwq. Once execution starts, the flag
* is cleared and the high bits contain OFFQ flags and pool ID.
*
- * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
- * and clear_work_data() can be used to set the pwq, pool or clear
- * work->data. These functions should only be called while the work is
- * owned - ie. while the PENDING bit is set.
+ * set_work_pwq(), set_work_pool_and_clear_pending() and mark_work_canceling()
+ * can be used to set the pwq, pool or clear work->data. These functions should
+ * only be called while the work is owned - ie. while the PENDING bit is set.
*
* get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
* corresponding to a work. Pool is available once the work has been
@@ -668,29 +782,28 @@ static int work_next_color(int color)
* but stay off timer and worklist for arbitrarily long and nobody should
* try to steal the PENDING bit.
*/
-static inline void set_work_data(struct work_struct *work, unsigned long data,
- unsigned long flags)
+static inline void set_work_data(struct work_struct *work, unsigned long data)
{
WARN_ON_ONCE(!work_pending(work));
- atomic_long_set(&work->data, data | flags | work_static(work));
+ atomic_long_set(&work->data, data | work_static(work));
}
static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
- unsigned long extra_flags)
+ unsigned long flags)
{
- set_work_data(work, (unsigned long)pwq,
- WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
+ set_work_data(work, (unsigned long)pwq | WORK_STRUCT_PENDING |
+ WORK_STRUCT_PWQ | flags);
}
static void set_work_pool_and_keep_pending(struct work_struct *work,
- int pool_id)
+ int pool_id, unsigned long flags)
{
- set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
- WORK_STRUCT_PENDING);
+ set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) |
+ WORK_STRUCT_PENDING | flags);
}
static void set_work_pool_and_clear_pending(struct work_struct *work,
- int pool_id)
+ int pool_id, unsigned long flags)
{
/*
* The following wmb is paired with the implied mb in
@@ -699,7 +812,8 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
* owner.
*/
smp_wmb();
- set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
+ set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) |
+ flags);
/*
* The following mb guarantees that previous clear of a PENDING bit
* will not be reordered with any speculative LOADS or STORES from
@@ -731,15 +845,9 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
smp_mb();
}
-static void clear_work_data(struct work_struct *work)
-{
- smp_wmb(); /* see set_work_pool_and_clear_pending() */
- set_work_data(work, WORK_STRUCT_NO_POOL, 0);
-}
-
static inline struct pool_workqueue *work_struct_pwq(unsigned long data)
{
- return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK);
+ return (struct pool_workqueue *)(data & WORK_STRUCT_PWQ_MASK);
}
static struct pool_workqueue *get_work_pwq(struct work_struct *work)
@@ -806,7 +914,7 @@ static void mark_work_canceling(struct work_struct *work)
unsigned long pool_id = get_work_pool_id(work);
pool_id <<= WORK_OFFQ_POOL_SHIFT;
- set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
+ set_work_data(work, pool_id | WORK_STRUCT_PENDING | WORK_OFFQ_CANCELING);
}
static bool work_is_canceling(struct work_struct *work)
@@ -1101,6 +1209,29 @@ static bool assign_work(struct work_struct *work, struct worker *worker,
return true;
}
+static struct irq_work *bh_pool_irq_work(struct worker_pool *pool)
+{
+ int high = pool->attrs->nice == HIGHPRI_NICE_LEVEL ? 1 : 0;
+
+ return &per_cpu(bh_pool_irq_works, pool->cpu)[high];
+}
+
+static void kick_bh_pool(struct worker_pool *pool)
+{
+#ifdef CONFIG_SMP
+ /* see drain_dead_softirq_workfn() for BH_DRAINING */
+ if (unlikely(pool->cpu != smp_processor_id() &&
+ !(pool->flags & POOL_BH_DRAINING))) {
+ irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu);
+ return;
+ }
+#endif
+ if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
+ raise_softirq_irqoff(HI_SOFTIRQ);
+ else
+ raise_softirq_irqoff(TASKLET_SOFTIRQ);
+}
+
/**
* kick_pool - wake up an idle worker if necessary
* @pool: pool to kick
@@ -1118,6 +1249,11 @@ static bool kick_pool(struct worker_pool *pool)
if (!need_more_worker(pool) || !worker)
return false;
+ if (pool->flags & POOL_BH) {
+ kick_bh_pool(pool);
+ return true;
+ }
+
p = worker->task;
#ifdef CONFIG_SMP
@@ -1198,11 +1334,13 @@ restart:
u64 cnt;
/*
- * Start reporting from the fourth time and back off
+ * Start reporting from the warning_thresh and back off
* exponentially.
*/
cnt = atomic64_inc_return_relaxed(&ent->cnt);
- if (cnt >= 4 && is_power_of_2(cnt))
+ if (wq_cpu_intensive_warning_thresh &&
+ cnt >= wq_cpu_intensive_warning_thresh &&
+ is_power_of_2(cnt + 1 - wq_cpu_intensive_warning_thresh))
printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
ent->func, wq_cpu_intensive_thresh_us,
atomic64_read(&ent->cnt));
@@ -1231,10 +1369,12 @@ restart:
ent = &wci_ents[wci_nr_ents++];
ent->func = func;
- atomic64_set(&ent->cnt, 1);
+ atomic64_set(&ent->cnt, 0);
hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);
raw_spin_unlock(&wci_lock);
+
+ goto restart;
}
#else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
@@ -1402,6 +1542,74 @@ work_func_t wq_worker_last_func(struct task_struct *task)
}
/**
+ * wq_node_nr_active - Determine wq_node_nr_active to use
+ * @wq: workqueue of interest
+ * @node: NUMA node, can be %NUMA_NO_NODE
+ *
+ * Determine wq_node_nr_active to use for @wq on @node. Returns:
+ *
+ * - %NULL for per-cpu workqueues as they don't need to use shared nr_active.
+ *
+ * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
+ *
+ * - Otherwise, node_nr_active[@node].
+ */
+static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
+ int node)
+{
+ if (!(wq->flags & WQ_UNBOUND))
+ return NULL;
+
+ if (node == NUMA_NO_NODE)
+ node = nr_node_ids;
+
+ return wq->node_nr_active[node];
+}
+
+/**
+ * wq_update_node_max_active - Update per-node max_actives to use
+ * @wq: workqueue to update
+ * @off_cpu: CPU that's going down, -1 if a CPU is not going down
+ *
+ * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is
+ * distributed among nodes according to the proportions of numbers of online
+ * cpus. The result is always between @wq->min_active and max_active.
+ */
+static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
+{
+ struct cpumask *effective = unbound_effective_cpumask(wq);
+ int min_active = READ_ONCE(wq->min_active);
+ int max_active = READ_ONCE(wq->max_active);
+ int total_cpus, node;
+
+ lockdep_assert_held(&wq->mutex);
+
+ if (!wq_topo_initialized)
+ return;
+
+ if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective))
+ off_cpu = -1;
+
+ total_cpus = cpumask_weight_and(effective, cpu_online_mask);
+ if (off_cpu >= 0)
+ total_cpus--;
+
+ for_each_node(node) {
+ int node_cpus;
+
+ node_cpus = cpumask_weight_and(effective, cpumask_of_node(node));
+ if (off_cpu >= 0 && cpu_to_node(off_cpu) == node)
+ node_cpus--;
+
+ wq_node_nr_active(wq, node)->max =
+ clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus),
+ min_active, max_active);
+ }
+
+ wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active;
+}
+
+/**
* get_pwq - get an extra reference on the specified pool_workqueue
* @pwq: pool_workqueue to get
*
@@ -1453,24 +1661,336 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
}
}
-static void pwq_activate_inactive_work(struct work_struct *work)
+static bool pwq_is_empty(struct pool_workqueue *pwq)
{
- struct pool_workqueue *pwq = get_work_pwq(work);
+ return !pwq->nr_active && list_empty(&pwq->inactive_works);
+}
+static void __pwq_activate_work(struct pool_workqueue *pwq,
+ struct work_struct *work)
+{
+ unsigned long *wdb = work_data_bits(work);
+
+ WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));
trace_workqueue_activate_work(work);
if (list_empty(&pwq->pool->worklist))
pwq->pool->watchdog_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL);
- __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work));
+ __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
+}
+
+/**
+ * pwq_activate_work - Activate a work item if inactive
+ * @pwq: pool_workqueue @work belongs to
+ * @work: work item to activate
+ *
+ * Returns %true if activated. %false if already active.
+ */
+static bool pwq_activate_work(struct pool_workqueue *pwq,
+ struct work_struct *work)
+{
+ struct worker_pool *pool = pwq->pool;
+ struct wq_node_nr_active *nna;
+
+ lockdep_assert_held(&pool->lock);
+
+ if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE))
+ return false;
+
+ nna = wq_node_nr_active(pwq->wq, pool->node);
+ if (nna)
+ atomic_inc(&nna->nr);
+
pwq->nr_active++;
+ __pwq_activate_work(pwq, work);
+ return true;
+}
+
+static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
+{
+ int max = READ_ONCE(nna->max);
+
+ while (true) {
+ int old, tmp;
+
+ old = atomic_read(&nna->nr);
+ if (old >= max)
+ return false;
+ tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1);
+ if (tmp == old)
+ return true;
+ }
+}
+
+/**
+ * pwq_tryinc_nr_active - Try to increment nr_active for a pwq
+ * @pwq: pool_workqueue of interest
+ * @fill: max_active may have increased, try to increase concurrency level
+ *
+ * Try to increment nr_active for @pwq. Returns %true if an nr_active count is
+ * successfully obtained. %false otherwise.
+ */
+static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill)
+{
+ struct workqueue_struct *wq = pwq->wq;
+ struct worker_pool *pool = pwq->pool;
+ struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
+ bool obtained = false;
+
+ lockdep_assert_held(&pool->lock);
+
+ if (!nna) {
+ /* BH or per-cpu workqueue, pwq->nr_active is sufficient */
+ obtained = pwq->nr_active < READ_ONCE(wq->max_active);
+ goto out;
+ }
+
+ if (unlikely(pwq->plugged))
+ return false;
+
+ /*
+ * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is
+ * already waiting on $nna, pwq_dec_nr_active() will maintain the
+ * concurrency level. Don't jump the line.
+ *
+ * We need to ignore the pending test after max_active has increased as
+ * pwq_dec_nr_active() can only maintain the concurrency level but not
+ * increase it. This is indicated by @fill.
+ */
+ if (!list_empty(&pwq->pending_node) && likely(!fill))
+ goto out;
+
+ obtained = tryinc_node_nr_active(nna);
+ if (obtained)
+ goto out;
+
+ /*
+ * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs
+ * and try again. The smp_mb() is paired with the implied memory barrier
+ * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either
+ * we see the decremented $nna->nr or they see non-empty
+ * $nna->pending_pwqs.
+ */
+ raw_spin_lock(&nna->lock);
+
+ if (list_empty(&pwq->pending_node))
+ list_add_tail(&pwq->pending_node, &nna->pending_pwqs);
+ else if (likely(!fill))
+ goto out_unlock;
+
+ smp_mb();
+
+ obtained = tryinc_node_nr_active(nna);
+
+ /*
+ * If @fill, @pwq might have already been pending. Being spuriously
+ * pending in cold paths doesn't affect anything. Let's leave it be.
+ */
+ if (obtained && likely(!fill))
+ list_del_init(&pwq->pending_node);
+
+out_unlock:
+ raw_spin_unlock(&nna->lock);
+out:
+ if (obtained)
+ pwq->nr_active++;
+ return obtained;
+}
+
+/**
+ * pwq_activate_first_inactive - Activate the first inactive work item on a pwq
+ * @pwq: pool_workqueue of interest
+ * @fill: max_active may have increased, try to increase concurrency level
+ *
+ * Activate the first inactive work item of @pwq if available and allowed by
+ * max_active limit.
+ *
+ * Returns %true if an inactive work item has been activated. %false if no
+ * inactive work item is found or max_active limit is reached.
+ */
+static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill)
+{
+ struct work_struct *work =
+ list_first_entry_or_null(&pwq->inactive_works,
+ struct work_struct, entry);
+
+ if (work && pwq_tryinc_nr_active(pwq, fill)) {
+ __pwq_activate_work(pwq, work);
+ return true;
+ } else {
+ return false;
+ }
}
-static void pwq_activate_first_inactive(struct pool_workqueue *pwq)
+/**
+ * unplug_oldest_pwq - unplug the oldest pool_workqueue
+ * @wq: workqueue_struct where its oldest pwq is to be unplugged
+ *
+ * This function should only be called for ordered workqueues where only the
+ * oldest pwq is unplugged, the others are plugged to suspend execution to
+ * ensure proper work item ordering::
+ *
+ * dfl_pwq --------------+ [P] - plugged
+ * |
+ * v
+ * pwqs -> A -> B [P] -> C [P] (newest)
+ * | | |
+ * 1 3 5
+ * | | |
+ * 2 4 6
+ *
+ * When the oldest pwq is drained and removed, this function should be called
+ * to unplug the next oldest one to start its work item execution. Note that
+ * pwq's are linked into wq->pwqs with the oldest first, so the first one in
+ * the list is the oldest.
+ */
+static void unplug_oldest_pwq(struct workqueue_struct *wq)
{
- struct work_struct *work = list_first_entry(&pwq->inactive_works,
- struct work_struct, entry);
+ struct pool_workqueue *pwq;
- pwq_activate_inactive_work(work);
+ lockdep_assert_held(&wq->mutex);
+
+ /* Caller should make sure that pwqs isn't empty before calling */
+ pwq = list_first_entry_or_null(&wq->pwqs, struct pool_workqueue,
+ pwqs_node);
+ raw_spin_lock_irq(&pwq->pool->lock);
+ if (pwq->plugged) {
+ pwq->plugged = false;
+ if (pwq_activate_first_inactive(pwq, true))
+ kick_pool(pwq->pool);
+ }
+ raw_spin_unlock_irq(&pwq->pool->lock);
+}
+
+/**
+ * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active
+ * @nna: wq_node_nr_active to activate a pending pwq for
+ * @caller_pool: worker_pool the caller is locking
+ *
+ * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked.
+ * @caller_pool may be unlocked and relocked to lock other worker_pools.
+ */
+static void node_activate_pending_pwq(struct wq_node_nr_active *nna,
+ struct worker_pool *caller_pool)
+{
+ struct worker_pool *locked_pool = caller_pool;
+ struct pool_workqueue *pwq;
+ struct work_struct *work;
+
+ lockdep_assert_held(&caller_pool->lock);
+
+ raw_spin_lock(&nna->lock);
+retry:
+ pwq = list_first_entry_or_null(&nna->pending_pwqs,
+ struct pool_workqueue, pending_node);
+ if (!pwq)
+ goto out_unlock;
+
+ /*
+ * If @pwq is for a different pool than @locked_pool, we need to lock
+ * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock
+ * / lock dance. For that, we also need to release @nna->lock as it's
+ * nested inside pool locks.
+ */
+ if (pwq->pool != locked_pool) {
+ raw_spin_unlock(&locked_pool->lock);
+ locked_pool = pwq->pool;
+ if (!raw_spin_trylock(&locked_pool->lock)) {
+ raw_spin_unlock(&nna->lock);
+ raw_spin_lock(&locked_pool->lock);
+ raw_spin_lock(&nna->lock);
+ goto retry;
+ }
+ }
+
+ /*
+ * $pwq may not have any inactive work items due to e.g. cancellations.
+ * Drop it from pending_pwqs and see if there's another one.
+ */
+ work = list_first_entry_or_null(&pwq->inactive_works,
+ struct work_struct, entry);
+ if (!work) {
+ list_del_init(&pwq->pending_node);
+ goto retry;
+ }
+
+ /*
+ * Acquire an nr_active count and activate the inactive work item. If
+ * $pwq still has inactive work items, rotate it to the end of the
+ * pending_pwqs so that we round-robin through them. This means that
+ * inactive work items are not activated in queueing order which is fine
+ * given that there has never been any ordering across different pwqs.
+ */
+ if (likely(tryinc_node_nr_active(nna))) {
+ pwq->nr_active++;
+ __pwq_activate_work(pwq, work);
+
+ if (list_empty(&pwq->inactive_works))
+ list_del_init(&pwq->pending_node);
+ else
+ list_move_tail(&pwq->pending_node, &nna->pending_pwqs);
+
+ /* if activating a foreign pool, make sure it's running */
+ if (pwq->pool != caller_pool)
+ kick_pool(pwq->pool);
+ }
+
+out_unlock:
+ raw_spin_unlock(&nna->lock);
+ if (locked_pool != caller_pool) {
+ raw_spin_unlock(&locked_pool->lock);
+ raw_spin_lock(&caller_pool->lock);
+ }
+}
+
+/**
+ * pwq_dec_nr_active - Retire an active count
+ * @pwq: pool_workqueue of interest
+ *
+ * Decrement @pwq's nr_active and try to activate the first inactive work item.
+ * For unbound workqueues, this function may temporarily drop @pwq->pool->lock.
+ */
+static void pwq_dec_nr_active(struct pool_workqueue *pwq)
+{
+ struct worker_pool *pool = pwq->pool;
+ struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node);
+
+ lockdep_assert_held(&pool->lock);
+
+ /*
+ * @pwq->nr_active should be decremented for both percpu and unbound
+ * workqueues.
+ */
+ pwq->nr_active--;
+
+ /*
+ * For a percpu workqueue, it's simple. Just need to kick the first
+ * inactive work item on @pwq itself.
+ */
+ if (!nna) {
+ pwq_activate_first_inactive(pwq, false);
+ return;
+ }
+
+ /*
+ * If @pwq is for an unbound workqueue, it's more complicated because
+ * multiple pwqs and pools may be sharing the nr_active count. When a
+ * pwq needs to wait for an nr_active count, it puts itself on
+ * $nna->pending_pwqs. The following atomic_dec_return()'s implied
+ * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to
+ * guarantee that either we see non-empty pending_pwqs or they see
+ * decremented $nna->nr.
+ *
+ * $nna->max may change as CPUs come online/offline and @pwq->wq's
+ * max_active gets updated. However, it is guaranteed to be equal to or
+ * larger than @pwq->wq->min_active which is above zero unless freezing.
+ * This maintains the forward progress guarantee.
+ */
+ if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max))
+ return;
+
+ if (!list_empty(&nna->pending_pwqs))
+ node_activate_pending_pwq(nna, pool);
}
/**
@@ -1481,6 +2001,11 @@ static void pwq_activate_first_inactive(struct pool_workqueue *pwq)
* A work either has completed or is removed from pending queue,
* decrement nr_in_flight of its pwq and handle workqueue flushing.
*
+ * NOTE:
+ * For unbound workqueues, this function may temporarily drop @pwq->pool->lock
+ * and thus should be called after all other state updates for the in-flight
+ * work item is complete.
+ *
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
*/
@@ -1488,14 +2013,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_
{
int color = get_work_color(work_data);
- if (!(work_data & WORK_STRUCT_INACTIVE)) {
- pwq->nr_active--;
- if (!list_empty(&pwq->inactive_works)) {
- /* one down, submit an inactive one */
- if (pwq->nr_active < pwq->max_active)
- pwq_activate_first_inactive(pwq);
- }
- }
+ if (!(work_data & WORK_STRUCT_INACTIVE))
+ pwq_dec_nr_active(pwq);
pwq->nr_in_flight[color]--;
@@ -1523,8 +2042,8 @@ out_put:
/**
* try_to_grab_pending - steal work item from worklist and disable irq
* @work: work item to steal
- * @is_dwork: @work is a delayed_work
- * @flags: place to store irq state
+ * @cflags: %WORK_CANCEL_ flags
+ * @irq_flags: place to store irq state
*
* Try to grab PENDING bit of @work. This function can handle @work in any
* stable state - idle, on timer or on worklist.
@@ -1546,20 +2065,20 @@ out_put:
* irqsafe, ensures that we return -EAGAIN for finite short period of time.
*
* On successful return, >= 0, irq is disabled and the caller is
- * responsible for releasing it using local_irq_restore(*@flags).
+ * responsible for releasing it using local_irq_restore(*@irq_flags).
*
* This function is safe to call from any context including IRQ handler.
*/
-static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
- unsigned long *flags)
+static int try_to_grab_pending(struct work_struct *work, u32 cflags,
+ unsigned long *irq_flags)
{
struct worker_pool *pool;
struct pool_workqueue *pwq;
- local_irq_save(*flags);
+ local_irq_save(*irq_flags);
/* try to steal the timer if it exists */
- if (is_dwork) {
+ if (cflags & WORK_CANCEL_DELAYED) {
struct delayed_work *dwork = to_delayed_work(work);
/*
@@ -1595,6 +2114,8 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
*/
pwq = get_work_pwq(work);
if (pwq && pwq->pool == pool) {
+ unsigned long work_data;
+
debug_work_deactivate(work);
/*
@@ -1608,14 +2129,19 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
* management later on and cause stall. Make sure the work
* item is activated before grabbing.
*/
- if (*work_data_bits(work) & WORK_STRUCT_INACTIVE)
- pwq_activate_inactive_work(work);
+ pwq_activate_work(pwq, work);
list_del_init(&work->entry);
- pwq_dec_nr_in_flight(pwq, *work_data_bits(work));
- /* work->data points to pwq iff queued, point to pool */
- set_work_pool_and_keep_pending(work, pool->id);
+ /*
+ * work->data points to pwq iff queued. Let's point to pool. As
+ * this destroys work->data needed by the next step, stash it.
+ */
+ work_data = *work_data_bits(work);
+ set_work_pool_and_keep_pending(work, pool->id, 0);
+
+ /* must be the last step, see the function comment */
+ pwq_dec_nr_in_flight(pwq, work_data);
raw_spin_unlock(&pool->lock);
rcu_read_unlock();
@@ -1624,13 +2150,82 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
raw_spin_unlock(&pool->lock);
fail:
rcu_read_unlock();
- local_irq_restore(*flags);
+ local_irq_restore(*irq_flags);
if (work_is_canceling(work))
return -ENOENT;
cpu_relax();
return -EAGAIN;
}
+struct cwt_wait {
+ wait_queue_entry_t wait;
+ struct work_struct *work;
+};
+
+static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
+{
+ struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
+
+ if (cwait->work != key)
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+
+/**
+ * work_grab_pending - steal work item from worklist and disable irq
+ * @work: work item to steal
+ * @cflags: %WORK_CANCEL_ flags
+ * @irq_flags: place to store IRQ state
+ *
+ * Grab PENDING bit of @work. @work can be in any stable state - idle, on timer
+ * or on worklist.
+ *
+ * Must be called in process context. IRQ is disabled on return with IRQ state
+ * stored in *@irq_flags. The caller is responsible for re-enabling it using
+ * local_irq_restore().
+ *
+ * Returns %true if @work was pending. %false if idle.
+ */
+static bool work_grab_pending(struct work_struct *work, u32 cflags,
+ unsigned long *irq_flags)
+{
+ struct cwt_wait cwait;
+ int ret;
+
+ might_sleep();
+repeat:
+ ret = try_to_grab_pending(work, cflags, irq_flags);
+ if (likely(ret >= 0))
+ return ret;
+ if (ret != -ENOENT)
+ goto repeat;
+
+ /*
+ * Someone is already canceling. Wait for it to finish. flush_work()
+ * doesn't work for PREEMPT_NONE because we may get woken up between
+ * @work's completion and the other canceling task resuming and clearing
+ * CANCELING - flush_work() will return false immediately as @work is no
+ * longer busy, try_to_grab_pending() will return -ENOENT as @work is
+ * still being canceled and the other canceling task won't be able to
+ * clear CANCELING as we're hogging the CPU.
+ *
+ * Let's wait for completion using a waitqueue. As this may lead to the
+ * thundering herd problem, use a custom wake function which matches
+ * @work along with exclusive wait and wakeup.
+ */
+ init_wait(&cwait.wait);
+ cwait.wait.func = cwt_wakefn;
+ cwait.work = work;
+
+ prepare_to_wait_exclusive(&wq_cancel_waitq, &cwait.wait,
+ TASK_UNINTERRUPTIBLE);
+ if (work_is_canceling(work))
+ schedule();
+ finish_wait(&wq_cancel_waitq, &cwait.wait);
+
+ goto repeat;
+}
+
/**
* insert_work - insert a work into a pool
* @pwq: pwq @work belongs to
@@ -1718,7 +2313,6 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
*/
lockdep_assert_irqs_disabled();
-
/*
* For a draining wq, only works from the same workqueue are
* allowed. The __WQ_DESTROYING helps to spot the issue that
@@ -1793,12 +2387,16 @@ retry:
pwq->nr_in_flight[pwq->work_color]++;
work_flags = work_color_to_flags(pwq->work_color);
- if (likely(pwq->nr_active < pwq->max_active)) {
+ /*
+ * Limit the number of concurrently active work items to max_active.
+ * @work must also queue behind existing inactive work items to maintain
+ * ordering when max_active changes. See wq_adjust_max_active().
+ */
+ if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
if (list_empty(&pool->worklist))
pool->watchdog_ts = jiffies;
trace_workqueue_activate_work(work);
- pwq->nr_active++;
insert_work(pwq, work, &pool->worklist, work_flags);
kick_pool(pool);
} else {
@@ -1829,16 +2427,16 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
bool ret = false;
- unsigned long flags;
+ unsigned long irq_flags;
- local_irq_save(flags);
+ local_irq_save(irq_flags);
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
__queue_work(cpu, wq, work);
ret = true;
}
- local_irq_restore(flags);
+ local_irq_restore(irq_flags);
return ret;
}
EXPORT_SYMBOL(queue_work_on);
@@ -1895,7 +2493,7 @@ static int select_numa_node_cpu(int node)
bool queue_work_node(int node, struct workqueue_struct *wq,
struct work_struct *work)
{
- unsigned long flags;
+ unsigned long irq_flags;
bool ret = false;
/*
@@ -1909,7 +2507,7 @@ bool queue_work_node(int node, struct workqueue_struct *wq,
*/
WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND));
- local_irq_save(flags);
+ local_irq_save(irq_flags);
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
int cpu = select_numa_node_cpu(node);
@@ -1918,7 +2516,7 @@ bool queue_work_node(int node, struct workqueue_struct *wq,
ret = true;
}
- local_irq_restore(flags);
+ local_irq_restore(irq_flags);
return ret;
}
EXPORT_SYMBOL_GPL(queue_work_node);
@@ -1958,10 +2556,18 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
dwork->cpu = cpu;
timer->expires = jiffies + delay;
- if (unlikely(cpu != WORK_CPU_UNBOUND))
+ if (housekeeping_enabled(HK_TYPE_TIMER)) {
+ /* If the current cpu is a housekeeping cpu, use it. */
+ cpu = smp_processor_id();
+ if (!housekeeping_test_cpu(cpu, HK_TYPE_TIMER))
+ cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
add_timer_on(timer, cpu);
- else
- add_timer(timer);
+ } else {
+ if (likely(cpu == WORK_CPU_UNBOUND))
+ add_timer(timer);
+ else
+ add_timer_on(timer, cpu);
+ }
}
/**
@@ -1980,17 +2586,17 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
{
struct work_struct *work = &dwork->work;
bool ret = false;
- unsigned long flags;
+ unsigned long irq_flags;
/* read the comment in __queue_work() */
- local_irq_save(flags);
+ local_irq_save(irq_flags);
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
__queue_delayed_work(cpu, wq, dwork, delay);
ret = true;
}
- local_irq_restore(flags);
+ local_irq_restore(irq_flags);
return ret;
}
EXPORT_SYMBOL(queue_delayed_work_on);
@@ -2016,16 +2622,17 @@ EXPORT_SYMBOL(queue_delayed_work_on);
bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
struct delayed_work *dwork, unsigned long delay)
{
- unsigned long flags;
+ unsigned long irq_flags;
int ret;
do {
- ret = try_to_grab_pending(&dwork->work, true, &flags);
+ ret = try_to_grab_pending(&dwork->work, WORK_CANCEL_DELAYED,
+ &irq_flags);
} while (unlikely(ret == -EAGAIN));
if (likely(ret >= 0)) {
__queue_delayed_work(cpu, wq, dwork, delay);
- local_irq_restore(flags);
+ local_irq_restore(irq_flags);
}
/* -ENOENT from try_to_grab_pending() becomes %true */
@@ -2100,19 +2707,21 @@ static cpumask_t *pool_allowed_cpus(struct worker_pool *pool)
* cpu-[un]hotplugs.
*/
static void worker_attach_to_pool(struct worker *worker,
- struct worker_pool *pool)
+ struct worker_pool *pool)
{
mutex_lock(&wq_pool_attach_mutex);
/*
- * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
- * stable across this function. See the comments above the flag
- * definition for details.
+ * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains stable
+ * across this function. See the comments above the flag definition for
+ * details. BH workers are, while per-CPU, always DISASSOCIATED.
*/
- if (pool->flags & POOL_DISASSOCIATED)
+ if (pool->flags & POOL_DISASSOCIATED) {
worker->flags |= WORKER_UNBOUND;
- else
+ } else {
+ WARN_ON_ONCE(pool->flags & POOL_BH);
kthread_set_per_cpu(worker->task, pool->cpu);
+ }
if (worker->rescue_wq)
set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool));
@@ -2136,6 +2745,9 @@ static void worker_detach_from_pool(struct worker *worker)
struct worker_pool *pool = worker->pool;
struct completion *detach_completion = NULL;
+ /* there is one permanent BH worker per CPU which should never detach */
+ WARN_ON_ONCE(pool->flags & POOL_BH);
+
mutex_lock(&wq_pool_attach_mutex);
kthread_set_per_cpu(worker->task, -1);
@@ -2187,27 +2799,29 @@ static struct worker *create_worker(struct worker_pool *pool)
worker->id = id;
- if (pool->cpu >= 0)
- snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
- pool->attrs->nice < 0 ? "H" : "");
- else
- snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
-
- worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
- "kworker/%s", id_buf);
- if (IS_ERR(worker->task)) {
- if (PTR_ERR(worker->task) == -EINTR) {
- pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n",
- id_buf);
- } else {
- pr_err_once("workqueue: Failed to create a worker thread: %pe",
- worker->task);
+ if (!(pool->flags & POOL_BH)) {
+ if (pool->cpu >= 0)
+ snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
+ pool->attrs->nice < 0 ? "H" : "");
+ else
+ snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
+
+ worker->task = kthread_create_on_node(worker_thread, worker,
+ pool->node, "kworker/%s", id_buf);
+ if (IS_ERR(worker->task)) {
+ if (PTR_ERR(worker->task) == -EINTR) {
+ pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n",
+ id_buf);
+ } else {
+ pr_err_once("workqueue: Failed to create a worker thread: %pe",
+ worker->task);
+ }
+ goto fail;
}
- goto fail;
- }
- set_user_nice(worker->task, pool->attrs->nice);
- kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
+ set_user_nice(worker->task, pool->attrs->nice);
+ kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
+ }
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
@@ -2217,14 +2831,14 @@ static struct worker *create_worker(struct worker_pool *pool)
worker->pool->nr_workers++;
worker_enter_idle(worker);
- kick_pool(pool);
/*
* @worker is waiting on a completion in kthread() and will trigger hung
- * check if not woken up soon. As kick_pool() might not have waken it
- * up, wake it up explicitly once more.
+ * check if not woken up soon. As kick_pool() is noop if @pool is empty,
+ * wake it up explicitly.
*/
- wake_up_process(worker->task);
+ if (worker->task)
+ wake_up_process(worker->task);
raw_spin_unlock_irq(&pool->lock);
@@ -2543,6 +3157,8 @@ __acquires(&pool->lock)
struct pool_workqueue *pwq = get_work_pwq(work);
struct worker_pool *pool = worker->pool;
unsigned long work_data;
+ int lockdep_start_depth, rcu_start_depth;
+ bool bh_draining = pool->flags & POOL_BH_DRAINING;
#ifdef CONFIG_LOCKDEP
/*
* It is permissible to free the struct work_struct from
@@ -2565,7 +3181,8 @@ __acquires(&pool->lock)
worker->current_work = work;
worker->current_func = work->func;
worker->current_pwq = pwq;
- worker->current_at = worker->task->se.sum_exec_runtime;
+ if (worker->task)
+ worker->current_at = worker->task->se.sum_exec_runtime;
work_data = *work_data_bits(work);
worker->current_color = get_work_color(work_data);
@@ -2600,12 +3217,16 @@ __acquires(&pool->lock)
* PENDING and queued state changes happen together while IRQ is
* disabled.
*/
- set_work_pool_and_clear_pending(work, pool->id);
+ set_work_pool_and_clear_pending(work, pool->id, 0);
pwq->stats[PWQ_STAT_STARTED]++;
raw_spin_unlock_irq(&pool->lock);
- lock_map_acquire(&pwq->wq->lockdep_map);
+ rcu_start_depth = rcu_preempt_depth();
+ lockdep_start_depth = lockdep_depth(current);
+ /* see drain_dead_softirq_workfn() */
+ if (!bh_draining)
+ lock_map_acquire(&pwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
/*
* Strictly speaking we should mark the invariant state without holding
@@ -2638,12 +3259,17 @@ __acquires(&pool->lock)
trace_workqueue_execute_end(work, worker->current_func);
pwq->stats[PWQ_STAT_COMPLETED]++;
lock_map_release(&lockdep_map);
- lock_map_release(&pwq->wq->lockdep_map);
+ if (!bh_draining)
+ lock_map_release(&pwq->wq->lockdep_map);
- if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
- pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
- " last function: %ps\n",
- current->comm, preempt_count(), task_pid_nr(current),
+ if (unlikely((worker->task && in_atomic()) ||
+ lockdep_depth(current) != lockdep_start_depth ||
+ rcu_preempt_depth() != rcu_start_depth)) {
+ pr_err("BUG: workqueue leaked atomic, lock or RCU: %s[%d]\n"
+ " preempt=0x%08x lock=%d->%d RCU=%d->%d workfn=%ps\n",
+ current->comm, task_pid_nr(current), preempt_count(),
+ lockdep_start_depth, lockdep_depth(current),
+ rcu_start_depth, rcu_preempt_depth(),
worker->current_func);
debug_show_held_locks(current);
dump_stack();
@@ -2657,7 +3283,8 @@ __acquires(&pool->lock)
* stop_machine. At the same time, report a quiescent RCU state so
* the same condition doesn't freeze RCU.
*/
- cond_resched();
+ if (worker->task)
+ cond_resched();
raw_spin_lock_irq(&pool->lock);
@@ -2677,6 +3304,8 @@ __acquires(&pool->lock)
worker->current_func = NULL;
worker->current_pwq = NULL;
worker->current_color = INT_MAX;
+
+ /* must be the last step, see the function comment */
pwq_dec_nr_in_flight(pwq, work_data);
}
@@ -2938,6 +3567,139 @@ repeat:
goto repeat;
}
+static void bh_worker(struct worker *worker)
+{
+ struct worker_pool *pool = worker->pool;
+ int nr_restarts = BH_WORKER_RESTARTS;
+ unsigned long end = jiffies + BH_WORKER_JIFFIES;
+
+ raw_spin_lock_irq(&pool->lock);
+ worker_leave_idle(worker);
+
+ /*
+ * This function follows the structure of worker_thread(). See there for
+ * explanations on each step.
+ */
+ if (!need_more_worker(pool))
+ goto done;
+
+ WARN_ON_ONCE(!list_empty(&worker->scheduled));
+ worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
+
+ do {
+ struct work_struct *work =
+ list_first_entry(&pool->worklist,
+ struct work_struct, entry);
+
+ if (assign_work(work, worker, NULL))
+ process_scheduled_works(worker);
+ } while (keep_working(pool) &&
+ --nr_restarts && time_before(jiffies, end));
+
+ worker_set_flags(worker, WORKER_PREP);
+done:
+ worker_enter_idle(worker);
+ kick_pool(pool);
+ raw_spin_unlock_irq(&pool->lock);
+}
+
+/*
+ * TODO: Convert all tasklet users to workqueue and use softirq directly.
+ *
+ * This is currently called from tasklet[_hi]action() and thus is also called
+ * whenever there are tasklets to run. Let's do an early exit if there's nothing
+ * queued. Once conversion from tasklet is complete, the need_more_worker() test
+ * can be dropped.
+ *
+ * After full conversion, we'll add worker->softirq_action, directly use the
+ * softirq action and obtain the worker pointer from the softirq_action pointer.
+ */
+void workqueue_softirq_action(bool highpri)
+{
+ struct worker_pool *pool =
+ &per_cpu(bh_worker_pools, smp_processor_id())[highpri];
+ if (need_more_worker(pool))
+ bh_worker(list_first_entry(&pool->workers, struct worker, node));
+}
+
+struct wq_drain_dead_softirq_work {
+ struct work_struct work;
+ struct worker_pool *pool;
+ struct completion done;
+};
+
+static void drain_dead_softirq_workfn(struct work_struct *work)
+{
+ struct wq_drain_dead_softirq_work *dead_work =
+ container_of(work, struct wq_drain_dead_softirq_work, work);
+ struct worker_pool *pool = dead_work->pool;
+ bool repeat;
+
+ /*
+ * @pool's CPU is dead and we want to execute its still pending work
+ * items from this BH work item which is running on a different CPU. As
+ * its CPU is dead, @pool can't be kicked and, as work execution path
+ * will be nested, a lockdep annotation needs to be suppressed. Mark
+ * @pool with %POOL_BH_DRAINING for the special treatments.
+ */
+ raw_spin_lock_irq(&pool->lock);
+ pool->flags |= POOL_BH_DRAINING;
+ raw_spin_unlock_irq(&pool->lock);
+
+ bh_worker(list_first_entry(&pool->workers, struct worker, node));
+
+ raw_spin_lock_irq(&pool->lock);
+ pool->flags &= ~POOL_BH_DRAINING;
+ repeat = need_more_worker(pool);
+ raw_spin_unlock_irq(&pool->lock);
+
+ /*
+ * bh_worker() might hit consecutive execution limit and bail. If there
+ * still are pending work items, reschedule self and return so that we
+ * don't hog this CPU's BH.
+ */
+ if (repeat) {
+ if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
+ queue_work(system_bh_highpri_wq, work);
+ else
+ queue_work(system_bh_wq, work);
+ } else {
+ complete(&dead_work->done);
+ }
+}
+
+/*
+ * @cpu is dead. Drain the remaining BH work items on the current CPU. It's
+ * possible to allocate dead_work per CPU and avoid flushing. However, then we
+ * have to worry about draining overlapping with CPU coming back online or
+ * nesting (one CPU's dead_work queued on another CPU which is also dead and so
+ * on). Let's keep it simple and drain them synchronously. These are BH work
+ * items which shouldn't be requeued on the same pool. Shouldn't take long.
+ */
+void workqueue_softirq_dead(unsigned int cpu)
+{
+ int i;
+
+ for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
+ struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[i];
+ struct wq_drain_dead_softirq_work dead_work;
+
+ if (!need_more_worker(pool))
+ continue;
+
+ INIT_WORK(&dead_work.work, drain_dead_softirq_workfn);
+ dead_work.pool = pool;
+ init_completion(&dead_work.done);
+
+ if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
+ queue_work(system_bh_highpri_wq, &dead_work.work);
+ else
+ queue_work(system_bh_wq, &dead_work.work);
+
+ wait_for_completion(&dead_work.done);
+ }
+}
+
/**
* check_flush_dependency - check for flush dependency sanity
* @target_wq: workqueue being flushed
@@ -3010,6 +3772,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
struct wq_barrier *barr,
struct work_struct *target, struct worker *worker)
{
+ static __maybe_unused struct lock_class_key bh_key, thr_key;
unsigned int work_flags = 0;
unsigned int work_color;
struct list_head *head;
@@ -3019,15 +3782,20 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
* as we know for sure that this will not trigger any of the
* checks and call back into the fixup functions where we
* might deadlock.
+ *
+ * BH and threaded workqueues need separate lockdep keys to avoid
+ * spuriously triggering "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W}
+ * usage".
*/
- INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
+ INIT_WORK_ONSTACK_KEY(&barr->work, wq_barrier_func,
+ (pwq->wq->flags & WQ_BH) ? &bh_key : &thr_key);
__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
init_completion_map(&barr->done, &target->lockdep_map);
barr->task = current;
- /* The barrier work item does not participate in pwq->nr_active. */
+ /* The barrier work item does not participate in nr_active. */
work_flags |= WORK_STRUCT_INACTIVE;
/*
@@ -3124,6 +3892,35 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
return wait;
}
+static void touch_wq_lockdep_map(struct workqueue_struct *wq)
+{
+#ifdef CONFIG_LOCKDEP
+ if (wq->flags & WQ_BH)
+ local_bh_disable();
+
+ lock_map_acquire(&wq->lockdep_map);
+ lock_map_release(&wq->lockdep_map);
+
+ if (wq->flags & WQ_BH)
+ local_bh_enable();
+#endif
+}
+
+static void touch_work_lockdep_map(struct work_struct *work,
+ struct workqueue_struct *wq)
+{
+#ifdef CONFIG_LOCKDEP
+ if (wq->flags & WQ_BH)
+ local_bh_disable();
+
+ lock_map_acquire(&work->lockdep_map);
+ lock_map_release(&work->lockdep_map);
+
+ if (wq->flags & WQ_BH)
+ local_bh_enable();
+#endif
+}
+
/**
* __flush_workqueue - ensure that any scheduled work has run to completion.
* @wq: workqueue to flush
@@ -3143,8 +3940,7 @@ void __flush_workqueue(struct workqueue_struct *wq)
if (WARN_ON(!wq_online))
return;
- lock_map_acquire(&wq->lockdep_map);
- lock_map_release(&wq->lockdep_map);
+ touch_wq_lockdep_map(wq);
mutex_lock(&wq->mutex);
@@ -3316,7 +4112,7 @@ reflush:
bool drained;
raw_spin_lock_irq(&pwq->pool->lock);
- drained = !pwq->nr_active && list_empty(&pwq->inactive_works);
+ drained = pwq_is_empty(pwq);
raw_spin_unlock_irq(&pwq->pool->lock);
if (drained)
@@ -3343,6 +4139,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
struct worker *worker = NULL;
struct worker_pool *pool;
struct pool_workqueue *pwq;
+ struct workqueue_struct *wq;
might_sleep();
@@ -3366,11 +4163,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
pwq = worker->current_pwq;
}
- check_flush_dependency(pwq->wq, work);
+ wq = pwq->wq;
+ check_flush_dependency(wq, work);
insert_wq_barrier(pwq, barr, work, worker);
raw_spin_unlock_irq(&pool->lock);
+ touch_work_lockdep_map(work, wq);
+
/*
* Force a lock recursion deadlock when using flush_work() inside a
* single-threaded or rescuer equipped workqueue.
@@ -3380,11 +4180,9 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
* workqueues the deadlock happens when the rescuer stalls, blocking
* forward progress.
*/
- if (!from_cancel &&
- (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) {
- lock_map_acquire(&pwq->wq->lockdep_map);
- lock_map_release(&pwq->wq->lockdep_map);
- }
+ if (!from_cancel && (wq->saved_max_active == 1 || wq->rescuer))
+ touch_wq_lockdep_map(wq);
+
rcu_read_unlock();
return true;
already_gone:
@@ -3403,9 +4201,6 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
if (WARN_ON(!work->func))
return false;
- lock_map_acquire(&work->lockdep_map);
- lock_map_release(&work->lockdep_map);
-
if (start_flush_work(work, &barr, from_cancel)) {
wait_for_completion(&barr.done);
destroy_work_on_stack(&barr.work);
@@ -3432,108 +4227,6 @@ bool flush_work(struct work_struct *work)
}
EXPORT_SYMBOL_GPL(flush_work);
-struct cwt_wait {
- wait_queue_entry_t wait;
- struct work_struct *work;
-};
-
-static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
-{
- struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
-
- if (cwait->work != key)
- return 0;
- return autoremove_wake_function(wait, mode, sync, key);
-}
-
-static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
-{
- static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);
- unsigned long flags;
- int ret;
-
- do {
- ret = try_to_grab_pending(work, is_dwork, &flags);
- /*
- * If someone else is already canceling, wait for it to
- * finish. flush_work() doesn't work for PREEMPT_NONE
- * because we may get scheduled between @work's completion
- * and the other canceling task resuming and clearing
- * CANCELING - flush_work() will return false immediately
- * as @work is no longer busy, try_to_grab_pending() will
- * return -ENOENT as @work is still being canceled and the
- * other canceling task won't be able to clear CANCELING as
- * we're hogging the CPU.
- *
- * Let's wait for completion using a waitqueue. As this
- * may lead to the thundering herd problem, use a custom
- * wake function which matches @work along with exclusive
- * wait and wakeup.
- */
- if (unlikely(ret == -ENOENT)) {
- struct cwt_wait cwait;
-
- init_wait(&cwait.wait);
- cwait.wait.func = cwt_wakefn;
- cwait.work = work;
-
- prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
- TASK_UNINTERRUPTIBLE);
- if (work_is_canceling(work))
- schedule();
- finish_wait(&cancel_waitq, &cwait.wait);
- }
- } while (unlikely(ret < 0));
-
- /* tell other tasks trying to grab @work to back off */
- mark_work_canceling(work);
- local_irq_restore(flags);
-
- /*
- * This allows canceling during early boot. We know that @work
- * isn't executing.
- */
- if (wq_online)
- __flush_work(work, true);
-
- clear_work_data(work);
-
- /*
- * Paired with prepare_to_wait() above so that either
- * waitqueue_active() is visible here or !work_is_canceling() is
- * visible there.
- */
- smp_mb();
- if (waitqueue_active(&cancel_waitq))
- __wake_up(&cancel_waitq, TASK_NORMAL, 1, work);
-
- return ret;
-}
-
-/**
- * cancel_work_sync - cancel a work and wait for it to finish
- * @work: the work to cancel
- *
- * Cancel @work and wait for its execution to finish. This function
- * can be used even if the work re-queues itself or migrates to
- * another workqueue. On return from this function, @work is
- * guaranteed to be not pending or executing on any CPU.
- *
- * cancel_work_sync(&delayed_work->work) must not be used for
- * delayed_work's. Use cancel_delayed_work_sync() instead.
- *
- * The caller must ensure that the workqueue on which @work was last
- * queued can't be destroyed before this function returns.
- *
- * Return:
- * %true if @work was pending, %false otherwise.
- */
-bool cancel_work_sync(struct work_struct *work)
-{
- return __cancel_work_timer(work, false);
-}
-EXPORT_SYMBOL_GPL(cancel_work_sync);
-
/**
* flush_delayed_work - wait for a dwork to finish executing the last queueing
* @dwork: the delayed work to flush
@@ -3576,20 +4269,50 @@ bool flush_rcu_work(struct rcu_work *rwork)
}
EXPORT_SYMBOL(flush_rcu_work);
-static bool __cancel_work(struct work_struct *work, bool is_dwork)
+static bool __cancel_work(struct work_struct *work, u32 cflags)
{
- unsigned long flags;
+ unsigned long irq_flags;
int ret;
do {
- ret = try_to_grab_pending(work, is_dwork, &flags);
+ ret = try_to_grab_pending(work, cflags, &irq_flags);
} while (unlikely(ret == -EAGAIN));
if (unlikely(ret < 0))
return false;
- set_work_pool_and_clear_pending(work, get_work_pool_id(work));
- local_irq_restore(flags);
+ set_work_pool_and_clear_pending(work, get_work_pool_id(work), 0);
+ local_irq_restore(irq_flags);
+ return ret;
+}
+
+static bool __cancel_work_sync(struct work_struct *work, u32 cflags)
+{
+ unsigned long irq_flags;
+ bool ret;
+
+ /* claim @work and tell other tasks trying to grab @work to back off */
+ ret = work_grab_pending(work, cflags, &irq_flags);
+ mark_work_canceling(work);
+ local_irq_restore(irq_flags);
+
+ /*
+ * Skip __flush_work() during early boot when we know that @work isn't
+ * executing. This allows canceling during early boot.
+ */
+ if (wq_online)
+ __flush_work(work, true);
+
+ /*
+ * smp_mb() at the end of set_work_pool_and_clear_pending() is paired
+ * with prepare_to_wait() above so that either waitqueue_active() is
+ * visible here or !work_is_canceling() is visible there.
+ */
+ set_work_pool_and_clear_pending(work, WORK_OFFQ_POOL_NONE, 0);
+
+ if (waitqueue_active(&wq_cancel_waitq))
+ __wake_up(&wq_cancel_waitq, TASK_NORMAL, 1, work);
+
return ret;
}
@@ -3598,11 +4321,35 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork)
*/
bool cancel_work(struct work_struct *work)
{
- return __cancel_work(work, false);
+ return __cancel_work(work, 0);
}
EXPORT_SYMBOL(cancel_work);
/**
+ * cancel_work_sync - cancel a work and wait for it to finish
+ * @work: the work to cancel
+ *
+ * Cancel @work and wait for its execution to finish. This function
+ * can be used even if the work re-queues itself or migrates to
+ * another workqueue. On return from this function, @work is
+ * guaranteed to be not pending or executing on any CPU.
+ *
+ * cancel_work_sync(&delayed_work->work) must not be used for
+ * delayed_work's. Use cancel_delayed_work_sync() instead.
+ *
+ * The caller must ensure that the workqueue on which @work was last
+ * queued can't be destroyed before this function returns.
+ *
+ * Return:
+ * %true if @work was pending, %false otherwise.
+ */
+bool cancel_work_sync(struct work_struct *work)
+{
+ return __cancel_work_sync(work, 0);
+}
+EXPORT_SYMBOL_GPL(cancel_work_sync);
+
+/**
* cancel_delayed_work - cancel a delayed work
* @dwork: delayed_work to cancel
*
@@ -3620,7 +4367,7 @@ EXPORT_SYMBOL(cancel_work);
*/
bool cancel_delayed_work(struct delayed_work *dwork)
{
- return __cancel_work(&dwork->work, true);
+ return __cancel_work(&dwork->work, WORK_CANCEL_DELAYED);
}
EXPORT_SYMBOL(cancel_delayed_work);
@@ -3635,7 +4382,7 @@ EXPORT_SYMBOL(cancel_delayed_work);
*/
bool cancel_delayed_work_sync(struct delayed_work *dwork)
{
- return __cancel_work_timer(&dwork->work, true);
+ return __cancel_work_sync(&dwork->work, WORK_CANCEL_DELAYED);
}
EXPORT_SYMBOL(cancel_delayed_work_sync);
@@ -3927,11 +4674,66 @@ static void wq_free_lockdep(struct workqueue_struct *wq)
}
#endif
+static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
+{
+ int node;
+
+ for_each_node(node) {
+ kfree(nna_ar[node]);
+ nna_ar[node] = NULL;
+ }
+
+ kfree(nna_ar[nr_node_ids]);
+ nna_ar[nr_node_ids] = NULL;
+}
+
+static void init_node_nr_active(struct wq_node_nr_active *nna)
+{
+ nna->max = WQ_DFL_MIN_ACTIVE;
+ atomic_set(&nna->nr, 0);
+ raw_spin_lock_init(&nna->lock);
+ INIT_LIST_HEAD(&nna->pending_pwqs);
+}
+
+/*
+ * Each node's nr_active counter will be accessed mostly from its own node and
+ * should be allocated in the node.
+ */
+static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)
+{
+ struct wq_node_nr_active *nna;
+ int node;
+
+ for_each_node(node) {
+ nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node);
+ if (!nna)
+ goto err_free;
+ init_node_nr_active(nna);
+ nna_ar[node] = nna;
+ }
+
+ /* [nr_node_ids] is used as the fallback */
+ nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE);
+ if (!nna)
+ goto err_free;
+ init_node_nr_active(nna);
+ nna_ar[nr_node_ids] = nna;
+
+ return 0;
+
+err_free:
+ free_node_nr_active(nna_ar);
+ return -ENOMEM;
+}
+
static void rcu_free_wq(struct rcu_head *rcu)
{
struct workqueue_struct *wq =
container_of(rcu, struct workqueue_struct, rcu);
+ if (wq->flags & WQ_UNBOUND)
+ free_node_nr_active(wq->node_nr_active);
+
wq_free_lockdep(wq);
free_percpu(wq->cpu_pwq);
free_workqueue_attrs(wq->unbound_attrs);
@@ -4121,6 +4923,13 @@ static void pwq_release_workfn(struct kthread_work *work)
mutex_lock(&wq->mutex);
list_del_rcu(&pwq->pwqs_node);
is_last = list_empty(&wq->pwqs);
+
+ /*
+ * For ordered workqueue with a plugged dfl_pwq, restart it now.
+ */
+ if (!is_last && (wq->flags & __WQ_ORDERED))
+ unplug_oldest_pwq(wq);
+
mutex_unlock(&wq->mutex);
}
@@ -4130,6 +4939,15 @@ static void pwq_release_workfn(struct kthread_work *work)
mutex_unlock(&wq_pool_mutex);
}
+ if (!list_empty(&pwq->pending_node)) {
+ struct wq_node_nr_active *nna =
+ wq_node_nr_active(pwq->wq, pwq->pool->node);
+
+ raw_spin_lock_irq(&nna->lock);
+ list_del_init(&pwq->pending_node);
+ raw_spin_unlock_irq(&nna->lock);
+ }
+
call_rcu(&pwq->rcu, rcu_free_pwq);
/*
@@ -4142,55 +4960,11 @@ static void pwq_release_workfn(struct kthread_work *work)
}
}
-/**
- * pwq_adjust_max_active - update a pwq's max_active to the current setting
- * @pwq: target pool_workqueue
- *
- * If @pwq isn't freezing, set @pwq->max_active to the associated
- * workqueue's saved_max_active and activate inactive work items
- * accordingly. If @pwq is freezing, clear @pwq->max_active to zero.
- */
-static void pwq_adjust_max_active(struct pool_workqueue *pwq)
-{
- struct workqueue_struct *wq = pwq->wq;
- bool freezable = wq->flags & WQ_FREEZABLE;
- unsigned long flags;
-
- /* for @wq->saved_max_active */
- lockdep_assert_held(&wq->mutex);
-
- /* fast exit for non-freezable wqs */
- if (!freezable && pwq->max_active == wq->saved_max_active)
- return;
-
- /* this function can be called during early boot w/ irq disabled */
- raw_spin_lock_irqsave(&pwq->pool->lock, flags);
-
- /*
- * During [un]freezing, the caller is responsible for ensuring that
- * this function is called at least once after @workqueue_freezing
- * is updated and visible.
- */
- if (!freezable || !workqueue_freezing) {
- pwq->max_active = wq->saved_max_active;
-
- while (!list_empty(&pwq->inactive_works) &&
- pwq->nr_active < pwq->max_active)
- pwq_activate_first_inactive(pwq);
-
- kick_pool(pwq->pool);
- } else {
- pwq->max_active = 0;
- }
-
- raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
-}
-
/* initialize newly allocated @pwq which is associated with @wq and @pool */
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
struct worker_pool *pool)
{
- BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
+ BUG_ON((unsigned long)pwq & ~WORK_STRUCT_PWQ_MASK);
memset(pwq, 0, sizeof(*pwq));
@@ -4199,6 +4973,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
pwq->flush_color = -1;
pwq->refcnt = 1;
INIT_LIST_HEAD(&pwq->inactive_works);
+ INIT_LIST_HEAD(&pwq->pending_node);
INIT_LIST_HEAD(&pwq->pwqs_node);
INIT_LIST_HEAD(&pwq->mayday_node);
kthread_init_work(&pwq->release_work, pwq_release_workfn);
@@ -4218,11 +4993,8 @@ static void link_pwq(struct pool_workqueue *pwq)
/* set the matching work_color */
pwq->work_color = wq->work_color;
- /* sync max_active to the current setting */
- pwq_adjust_max_active(pwq);
-
/* link in @pwq */
- list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
+ list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs);
}
/* obtain a pool matching @attr and create a pwq associating the pool and @wq */
@@ -4289,10 +5061,11 @@ static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu,
"possible intersect\n");
}
-/* install @pwq into @wq's cpu_pwq and return the old pwq */
+/* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */
static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
int cpu, struct pool_workqueue *pwq)
{
+ struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu);
struct pool_workqueue *old_pwq;
lockdep_assert_held(&wq_pool_mutex);
@@ -4301,8 +5074,8 @@ static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
/* link_pwq() can handle duplicate calls */
link_pwq(pwq);
- old_pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu));
- rcu_assign_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu), pwq);
+ old_pwq = rcu_access_pointer(*slot);
+ rcu_assign_pointer(*slot, pwq);
return old_pwq;
}
@@ -4383,6 +5156,15 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
ctx->attrs = new_attrs;
+ /*
+ * For initialized ordered workqueues, there should only be one pwq
+ * (dfl_pwq). Set the plugged flag of ctx->dfl_pwq to suspend execution
+ * of newly queued work items until execution of older work items in
+ * the old pwq's have completed.
+ */
+ if ((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))
+ ctx->dfl_pwq->plugged = true;
+
ctx->wq = wq;
return ctx;
@@ -4402,14 +5184,19 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
- /* save the previous pwq and install the new one */
+ /* save the previous pwqs and install the new ones */
for_each_possible_cpu(cpu)
ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu,
ctx->pwq_tbl[cpu]);
+ ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq);
+
+ /* update node_nr_active->max */
+ wq_update_node_max_active(ctx->wq, -1);
- /* @dfl_pwq might not have been used, ensure it's linked */
- link_pwq(ctx->dfl_pwq);
- swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
+ /* rescuer needs to respect wq cpumask changes */
+ if (ctx->wq->rescuer)
+ set_cpus_allowed_ptr(ctx->wq->rescuer->task,
+ unbound_effective_cpumask(ctx->wq));
mutex_unlock(&ctx->wq->mutex);
}
@@ -4423,14 +5210,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
return -EINVAL;
- /* creating multiple pwqs breaks ordering guarantee */
- if (!list_empty(&wq->pwqs)) {
- if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
- return -EINVAL;
-
- wq->flags &= ~__WQ_ORDERED;
- }
-
ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);
if (IS_ERR(ctx))
return PTR_ERR(ctx);
@@ -4519,9 +5298,7 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu,
/* nothing to do if the target cpumask matches the current pwq */
wq_calc_pod_cpumask(target_attrs, cpu, off_cpu);
- pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu),
- lockdep_is_held(&wq_pool_mutex));
- if (wqattrs_equal(target_attrs, pwq->pool->attrs))
+ if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs))
return;
/* create a new pwq */
@@ -4539,10 +5316,11 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu,
use_dfl_pwq:
mutex_lock(&wq->mutex);
- raw_spin_lock_irq(&wq->dfl_pwq->pool->lock);
- get_pwq(wq->dfl_pwq);
- raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock);
- old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq);
+ pwq = unbound_pwq(wq, -1);
+ raw_spin_lock_irq(&pwq->pool->lock);
+ get_pwq(pwq);
+ raw_spin_unlock_irq(&pwq->pool->lock);
+ old_pwq = install_unbound_pwq(wq, cpu, pwq);
out_unlock:
mutex_unlock(&wq->mutex);
put_pwq_unlocked(old_pwq);
@@ -4559,10 +5337,17 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
if (!(wq->flags & WQ_UNBOUND)) {
for_each_possible_cpu(cpu) {
- struct pool_workqueue **pwq_p =
- per_cpu_ptr(wq->cpu_pwq, cpu);
- struct worker_pool *pool =
- &(per_cpu_ptr(cpu_worker_pools, cpu)[highpri]);
+ struct pool_workqueue **pwq_p;
+ struct worker_pool __percpu *pools;
+ struct worker_pool *pool;
+
+ if (wq->flags & WQ_BH)
+ pools = bh_worker_pools;
+ else
+ pools = cpu_worker_pools;
+
+ pool = &(per_cpu_ptr(pools, cpu)[highpri]);
+ pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu);
*pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL,
pool->node);
@@ -4580,10 +5365,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
cpus_read_lock();
if (wq->flags & __WQ_ORDERED) {
+ struct pool_workqueue *dfl_pwq;
+
ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
/* there should only be single pwq for ordering guarantee */
- WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
- wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
+ dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
+ WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||
+ wq->pwqs.prev != &dfl_pwq->pwqs_node),
"ordering guarantee broken for workqueue %s\n", wq->name);
} else {
ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
@@ -4652,12 +5440,78 @@ static int init_rescuer(struct workqueue_struct *wq)
}
wq->rescuer = rescuer;
- kthread_bind_mask(rescuer->task, cpu_possible_mask);
+ if (wq->flags & WQ_UNBOUND)
+ kthread_bind_mask(rescuer->task, wq_unbound_cpumask);
+ else
+ kthread_bind_mask(rescuer->task, cpu_possible_mask);
wake_up_process(rescuer->task);
return 0;
}
+/**
+ * wq_adjust_max_active - update a wq's max_active to the current setting
+ * @wq: target workqueue
+ *
+ * If @wq isn't freezing, set @wq->max_active to the saved_max_active and
+ * activate inactive work items accordingly. If @wq is freezing, clear
+ * @wq->max_active to zero.
+ */
+static void wq_adjust_max_active(struct workqueue_struct *wq)
+{
+ bool activated;
+ int new_max, new_min;
+
+ lockdep_assert_held(&wq->mutex);
+
+ if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) {
+ new_max = 0;
+ new_min = 0;
+ } else {
+ new_max = wq->saved_max_active;
+ new_min = wq->saved_min_active;
+ }
+
+ if (wq->max_active == new_max && wq->min_active == new_min)
+ return;
+
+ /*
+ * Update @wq->max/min_active and then kick inactive work items if more
+ * active work items are allowed. This doesn't break work item ordering
+ * because new work items are always queued behind existing inactive
+ * work items if there are any.
+ */
+ WRITE_ONCE(wq->max_active, new_max);
+ WRITE_ONCE(wq->min_active, new_min);
+
+ if (wq->flags & WQ_UNBOUND)
+ wq_update_node_max_active(wq, -1);
+
+ if (new_max == 0)
+ return;
+
+ /*
+ * Round-robin through pwq's activating the first inactive work item
+ * until max_active is filled.
+ */
+ do {
+ struct pool_workqueue *pwq;
+
+ activated = false;
+ for_each_pwq(pwq, wq) {
+ unsigned long irq_flags;
+
+ /* can be called during early boot w/ irq disabled */
+ raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags);
+ if (pwq_activate_first_inactive(pwq, true)) {
+ activated = true;
+ kick_pool(pwq->pool);
+ }
+ raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags);
+ }
+ } while (activated);
+}
+
__printf(1, 4)
struct workqueue_struct *alloc_workqueue(const char *fmt,
unsigned int flags,
@@ -4665,23 +5519,27 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
{
va_list args;
struct workqueue_struct *wq;
- struct pool_workqueue *pwq;
+ size_t wq_size;
+ int name_len;
- /*
- * Unbound && max_active == 1 used to imply ordered, which is no longer
- * the case on many machines due to per-pod pools. While
- * alloc_ordered_workqueue() is the right way to create an ordered
- * workqueue, keep the previous behavior to avoid subtle breakages.
- */
- if ((flags & WQ_UNBOUND) && max_active == 1)
- flags |= __WQ_ORDERED;
+ if (flags & WQ_BH) {
+ if (WARN_ON_ONCE(flags & ~__WQ_BH_ALLOWS))
+ return NULL;
+ if (WARN_ON_ONCE(max_active))
+ return NULL;
+ }
/* see the comment above the definition of WQ_POWER_EFFICIENT */
if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
flags |= WQ_UNBOUND;
/* allocate wq and format name */
- wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+ if (flags & WQ_UNBOUND)
+ wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1);
+ else
+ wq_size = sizeof(*wq);
+
+ wq = kzalloc(wq_size, GFP_KERNEL);
if (!wq)
return NULL;
@@ -4692,15 +5550,30 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
}
va_start(args, max_active);
- vsnprintf(wq->name, sizeof(wq->name), fmt, args);
+ name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
va_end(args);
- max_active = max_active ?: WQ_DFL_ACTIVE;
- max_active = wq_clamp_max_active(max_active, flags, wq->name);
+ if (name_len >= WQ_NAME_LEN)
+ pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
+ wq->name);
+
+ if (flags & WQ_BH) {
+ /*
+ * BH workqueues always share a single execution context per CPU
+ * and don't impose any max_active limit.
+ */
+ max_active = INT_MAX;
+ } else {
+ max_active = max_active ?: WQ_DFL_ACTIVE;
+ max_active = wq_clamp_max_active(max_active, flags, wq->name);
+ }
/* init wq */
wq->flags = flags;
- wq->saved_max_active = max_active;
+ wq->max_active = max_active;
+ wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE);
+ wq->saved_max_active = wq->max_active;
+ wq->saved_min_active = wq->min_active;
mutex_init(&wq->mutex);
atomic_set(&wq->nr_pwqs_to_flush, 0);
INIT_LIST_HEAD(&wq->pwqs);
@@ -4711,8 +5584,13 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
wq_init_lockdep(wq);
INIT_LIST_HEAD(&wq->list);
+ if (flags & WQ_UNBOUND) {
+ if (alloc_node_nr_active(wq->node_nr_active) < 0)
+ goto err_unreg_lockdep;
+ }
+
if (alloc_and_link_pwqs(wq) < 0)
- goto err_unreg_lockdep;
+ goto err_free_node_nr_active;
if (wq_online && init_rescuer(wq) < 0)
goto err_destroy;
@@ -4728,8 +5606,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
mutex_lock(&wq_pool_mutex);
mutex_lock(&wq->mutex);
- for_each_pwq(pwq, wq)
- pwq_adjust_max_active(pwq);
+ wq_adjust_max_active(wq);
mutex_unlock(&wq->mutex);
list_add_tail_rcu(&wq->list, &workqueues);
@@ -4738,6 +5615,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
return wq;
+err_free_node_nr_active:
+ if (wq->flags & WQ_UNBOUND)
+ free_node_nr_active(wq->node_nr_active);
err_unreg_lockdep:
wq_unregister_lockdep(wq);
wq_free_lockdep(wq);
@@ -4759,9 +5639,9 @@ static bool pwq_busy(struct pool_workqueue *pwq)
if (pwq->nr_in_flight[i])
return true;
- if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1))
+ if ((pwq != rcu_access_pointer(pwq->wq->dfl_pwq)) && (pwq->refcnt > 1))
return true;
- if (pwq->nr_active || !list_empty(&pwq->inactive_works))
+ if (!pwq_is_empty(pwq))
return true;
return false;
@@ -4843,13 +5723,12 @@ void destroy_workqueue(struct workqueue_struct *wq)
rcu_read_lock();
for_each_possible_cpu(cpu) {
- pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu));
- RCU_INIT_POINTER(*per_cpu_ptr(wq->cpu_pwq, cpu), NULL);
- put_pwq_unlocked(pwq);
+ put_pwq_unlocked(unbound_pwq(wq, cpu));
+ RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL);
}
- put_pwq_unlocked(wq->dfl_pwq);
- wq->dfl_pwq = NULL;
+ put_pwq_unlocked(unbound_pwq(wq, -1));
+ RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL);
rcu_read_unlock();
}
@@ -4860,34 +5739,63 @@ EXPORT_SYMBOL_GPL(destroy_workqueue);
* @wq: target workqueue
* @max_active: new max_active value.
*
- * Set max_active of @wq to @max_active.
+ * Set max_active of @wq to @max_active. See the alloc_workqueue() function
+ * comment.
*
* CONTEXT:
* Don't call from IRQ context.
*/
void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
{
- struct pool_workqueue *pwq;
-
+ /* max_active doesn't mean anything for BH workqueues */
+ if (WARN_ON(wq->flags & WQ_BH))
+ return;
/* disallow meddling with max_active for ordered workqueues */
- if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
+ if (WARN_ON(wq->flags & __WQ_ORDERED))
return;
max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
mutex_lock(&wq->mutex);
- wq->flags &= ~__WQ_ORDERED;
wq->saved_max_active = max_active;
+ if (wq->flags & WQ_UNBOUND)
+ wq->saved_min_active = min(wq->saved_min_active, max_active);
- for_each_pwq(pwq, wq)
- pwq_adjust_max_active(pwq);
+ wq_adjust_max_active(wq);
mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(workqueue_set_max_active);
/**
+ * workqueue_set_min_active - adjust min_active of an unbound workqueue
+ * @wq: target unbound workqueue
+ * @min_active: new min_active value
+ *
+ * Set min_active of an unbound workqueue. Unlike other types of workqueues, an
+ * unbound workqueue is not guaranteed to be able to process max_active
+ * interdependent work items. Instead, an unbound workqueue is guaranteed to be
+ * able to process min_active number of interdependent work items which is
+ * %WQ_DFL_MIN_ACTIVE by default.
+ *
+ * Use this function to adjust the min_active value between 0 and the current
+ * max_active.
+ */
+void workqueue_set_min_active(struct workqueue_struct *wq, int min_active)
+{
+ /* min_active is only meaningful for non-ordered unbound workqueues */
+ if (WARN_ON((wq->flags & (WQ_BH | WQ_UNBOUND | __WQ_ORDERED)) !=
+ WQ_UNBOUND))
+ return;
+
+ mutex_lock(&wq->mutex);
+ wq->saved_min_active = clamp(min_active, 0, wq->saved_max_active);
+ wq_adjust_max_active(wq);
+ mutex_unlock(&wq->mutex);
+}
+
+/**
* current_work - retrieve %current task's work struct
*
* Determine if %current task is a workqueue worker and what it's working on.
@@ -4972,7 +5880,7 @@ EXPORT_SYMBOL_GPL(workqueue_congested);
unsigned int work_busy(struct work_struct *work)
{
struct worker_pool *pool;
- unsigned long flags;
+ unsigned long irq_flags;
unsigned int ret = 0;
if (work_pending(work))
@@ -4981,10 +5889,10 @@ unsigned int work_busy(struct work_struct *work)
rcu_read_lock();
pool = get_work_pool(work);
if (pool) {
- raw_spin_lock_irqsave(&pool->lock, flags);
+ raw_spin_lock_irqsave(&pool->lock, irq_flags);
if (find_worker_executing_work(pool, work))
ret |= WORK_BUSY_RUNNING;
- raw_spin_unlock_irqrestore(&pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
}
rcu_read_unlock();
@@ -5069,7 +5977,24 @@ static void pr_cont_pool_info(struct worker_pool *pool)
pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
if (pool->node != NUMA_NO_NODE)
pr_cont(" node=%d", pool->node);
- pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
+ pr_cont(" flags=0x%x", pool->flags);
+ if (pool->flags & POOL_BH)
+ pr_cont(" bh%s",
+ pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
+ else
+ pr_cont(" nice=%d", pool->attrs->nice);
+}
+
+static void pr_cont_worker_id(struct worker *worker)
+{
+ struct worker_pool *pool = worker->pool;
+
+ if (pool->flags & WQ_BH)
+ pr_cont("bh%s",
+ pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
+ else
+ pr_cont("%d%s", task_pid_nr(worker->task),
+ worker->rescue_wq ? "(RESCUER)" : "");
}
struct pr_cont_work_struct {
@@ -5128,8 +6053,8 @@ static void show_pwq(struct pool_workqueue *pwq)
pr_info(" pwq %d:", pool->id);
pr_cont_pool_info(pool);
- pr_cont(" active=%d/%d refcnt=%d%s\n",
- pwq->nr_active, pwq->max_active, pwq->refcnt,
+ pr_cont(" active=%d refcnt=%d%s\n",
+ pwq->nr_active, pwq->refcnt,
!list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
hash_for_each(pool->busy_hash, bkt, worker, hentry) {
@@ -5146,10 +6071,9 @@ static void show_pwq(struct pool_workqueue *pwq)
if (worker->current_pwq != pwq)
continue;
- pr_cont("%s %d%s:%ps", comma ? "," : "",
- task_pid_nr(worker->task),
- worker->rescue_wq ? "(RESCUER)" : "",
- worker->current_func);
+ pr_cont(" %s", comma ? "," : "");
+ pr_cont_worker_id(worker);
+ pr_cont(":%ps", worker->current_func);
list_for_each_entry(work, &worker->scheduled, entry)
pr_cont_work(false, work, &pcws);
pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
@@ -5200,10 +6124,10 @@ void show_one_workqueue(struct workqueue_struct *wq)
{
struct pool_workqueue *pwq;
bool idle = true;
- unsigned long flags;
+ unsigned long irq_flags;
for_each_pwq(pwq, wq) {
- if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
+ if (!pwq_is_empty(pwq)) {
idle = false;
break;
}
@@ -5214,8 +6138,8 @@ void show_one_workqueue(struct workqueue_struct *wq)
pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
for_each_pwq(pwq, wq) {
- raw_spin_lock_irqsave(&pwq->pool->lock, flags);
- if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
+ raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags);
+ if (!pwq_is_empty(pwq)) {
/*
* Defer printing to avoid deadlocks in console
* drivers that queue work while holding locks
@@ -5225,7 +6149,7 @@ void show_one_workqueue(struct workqueue_struct *wq)
show_pwq(pwq);
printk_deferred_exit();
}
- raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags);
/*
* We could be printing a lot from atomic context, e.g.
* sysrq-t -> show_all_workqueues(). Avoid triggering
@@ -5244,10 +6168,10 @@ static void show_one_worker_pool(struct worker_pool *pool)
{
struct worker *worker;
bool first = true;
- unsigned long flags;
+ unsigned long irq_flags;
unsigned long hung = 0;
- raw_spin_lock_irqsave(&pool->lock, flags);
+ raw_spin_lock_irqsave(&pool->lock, irq_flags);
if (pool->nr_workers == pool->nr_idle)
goto next_pool;
@@ -5268,14 +6192,14 @@ static void show_one_worker_pool(struct worker_pool *pool)
pr_cont(" manager: %d",
task_pid_nr(pool->manager->task));
list_for_each_entry(worker, &pool->idle_list, entry) {
- pr_cont(" %s%d", first ? "idle: " : "",
- task_pid_nr(worker->task));
+ pr_cont(" %s", first ? "idle: " : "");
+ pr_cont_worker_id(worker);
first = false;
}
pr_cont("\n");
printk_deferred_exit();
next_pool:
- raw_spin_unlock_irqrestore(&pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
/*
* We could be printing a lot from atomic context, e.g.
* sysrq-t -> show_all_workqueues(). Avoid triggering
@@ -5542,13 +6466,15 @@ int workqueue_online_cpu(unsigned int cpu)
mutex_lock(&wq_pool_mutex);
for_each_pool(pool, pi) {
- mutex_lock(&wq_pool_attach_mutex);
+ /* BH pools aren't affected by hotplug */
+ if (pool->flags & POOL_BH)
+ continue;
+ mutex_lock(&wq_pool_attach_mutex);
if (pool->cpu == cpu)
rebind_workers(pool);
else if (pool->cpu < 0)
restore_unbound_workers_cpumask(pool, cpu);
-
mutex_unlock(&wq_pool_attach_mutex);
}
@@ -5562,6 +6488,10 @@ int workqueue_online_cpu(unsigned int cpu)
for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
wq_update_pod(wq, tcpu, cpu, true);
+
+ mutex_lock(&wq->mutex);
+ wq_update_node_max_active(wq, -1);
+ mutex_unlock(&wq->mutex);
}
}
@@ -5590,6 +6520,10 @@ int workqueue_offline_cpu(unsigned int cpu)
for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
wq_update_pod(wq, tcpu, cpu, false);
+
+ mutex_lock(&wq->mutex);
+ wq_update_node_max_active(wq, cpu);
+ mutex_unlock(&wq->mutex);
}
}
mutex_unlock(&wq_pool_mutex);
@@ -5677,7 +6611,6 @@ EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
void freeze_workqueues_begin(void)
{
struct workqueue_struct *wq;
- struct pool_workqueue *pwq;
mutex_lock(&wq_pool_mutex);
@@ -5686,8 +6619,7 @@ void freeze_workqueues_begin(void)
list_for_each_entry(wq, &workqueues, list) {
mutex_lock(&wq->mutex);
- for_each_pwq(pwq, wq)
- pwq_adjust_max_active(pwq);
+ wq_adjust_max_active(wq);
mutex_unlock(&wq->mutex);
}
@@ -5752,7 +6684,6 @@ out_unlock:
void thaw_workqueues(void)
{
struct workqueue_struct *wq;
- struct pool_workqueue *pwq;
mutex_lock(&wq_pool_mutex);
@@ -5764,8 +6695,7 @@ void thaw_workqueues(void)
/* restore max_active and repopulate worklist */
list_for_each_entry(wq, &workqueues, list) {
mutex_lock(&wq->mutex);
- for_each_pwq(pwq, wq)
- pwq_adjust_max_active(pwq);
+ wq_adjust_max_active(wq);
mutex_unlock(&wq->mutex);
}
@@ -5784,16 +6714,9 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
lockdep_assert_held(&wq_pool_mutex);
list_for_each_entry(wq, &workqueues, list) {
- if (!(wq->flags & WQ_UNBOUND))
+ if (!(wq->flags & WQ_UNBOUND) || (wq->flags & __WQ_DESTROYING))
continue;
- /* creating multiple pwqs breaks ordering guarantee */
- if (!list_empty(&wq->pwqs)) {
- if (wq->flags & __WQ_ORDERED_EXPLICIT)
- continue;
- wq->flags &= ~__WQ_ORDERED;
- }
-
ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
if (IS_ERR(ctx)) {
ret = PTR_ERR(ctx);
@@ -6307,11 +7230,10 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
int ret;
/*
- * Adjusting max_active or creating new pwqs by applying
- * attributes breaks ordering guarantee. Disallow exposing ordered
- * workqueues.
+ * Adjusting max_active breaks ordering guarantee. Disallow exposing
+ * ordered workqueues.
*/
- if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
+ if (WARN_ON(wq->flags & __WQ_ORDERED))
return -EINVAL;
wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
@@ -6408,10 +7330,10 @@ static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
static void show_cpu_pool_hog(struct worker_pool *pool)
{
struct worker *worker;
- unsigned long flags;
+ unsigned long irq_flags;
int bkt;
- raw_spin_lock_irqsave(&pool->lock, flags);
+ raw_spin_lock_irqsave(&pool->lock, irq_flags);
hash_for_each(pool->busy_hash, bkt, worker, hentry) {
if (task_is_running(worker->task)) {
@@ -6429,7 +7351,7 @@ static void show_cpu_pool_hog(struct worker_pool *pool)
}
}
- raw_spin_unlock_irqrestore(&pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
}
static void show_cpu_pools_hogs(void)
@@ -6501,7 +7423,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
/* did we stall? */
if (time_after(now, ts + thresh)) {
lockup_detected = true;
- if (pool->cpu >= 0) {
+ if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) {
pool->cpu_stall = true;
cpu_pool_stall = true;
}
@@ -6584,6 +7506,16 @@ static inline void wq_watchdog_init(void) { }
#endif /* CONFIG_WQ_WATCHDOG */
+static void bh_pool_kick_normal(struct irq_work *irq_work)
+{
+ raise_softirq_irqoff(TASKLET_SOFTIRQ);
+}
+
+static void bh_pool_kick_highpri(struct irq_work *irq_work)
+{
+ raise_softirq_irqoff(HI_SOFTIRQ);
+}
+
static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask)
{
if (!cpumask_intersects(wq_unbound_cpumask, mask)) {
@@ -6595,6 +7527,22 @@ static void __init restrict_unbound_cpumask(const char *name, const struct cpuma
cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask);
}
+static void __init init_cpu_worker_pool(struct worker_pool *pool, int cpu, int nice)
+{
+ BUG_ON(init_worker_pool(pool));
+ pool->cpu = cpu;
+ cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
+ cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu));
+ pool->attrs->nice = nice;
+ pool->attrs->affn_strict = true;
+ pool->node = cpu_to_node(cpu);
+
+ /* alloc pool ID */
+ mutex_lock(&wq_pool_mutex);
+ BUG_ON(worker_pool_assign_id(pool));
+ mutex_unlock(&wq_pool_mutex);
+}
+
/**
* workqueue_init_early - early init for workqueue subsystem
*
@@ -6609,6 +7557,8 @@ void __init workqueue_init_early(void)
{
struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
+ void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal,
+ bh_pool_kick_highpri };
int i, cpu;
BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
@@ -6630,6 +7580,13 @@ void __init workqueue_init_early(void)
wq_update_pod_attrs_buf = alloc_workqueue_attrs();
BUG_ON(!wq_update_pod_attrs_buf);
+ /*
+ * If nohz_full is enabled, set power efficient workqueue as unbound.
+ * This allows workqueue items to be moved to HK CPUs.
+ */
+ if (housekeeping_enabled(HK_TYPE_TICK))
+ wq_power_efficient = true;
+
/* initialize WQ_AFFN_SYSTEM pods */
pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);
@@ -6643,25 +7600,21 @@ void __init workqueue_init_early(void)
pt->pod_node[0] = NUMA_NO_NODE;
pt->cpu_pod[0] = 0;
- /* initialize CPU pools */
+ /* initialize BH and CPU pools */
for_each_possible_cpu(cpu) {
struct worker_pool *pool;
i = 0;
- for_each_cpu_worker_pool(pool, cpu) {
- BUG_ON(init_worker_pool(pool));
- pool->cpu = cpu;
- cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
- cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu));
- pool->attrs->nice = std_nice[i++];
- pool->attrs->affn_strict = true;
- pool->node = cpu_to_node(cpu);
-
- /* alloc pool ID */
- mutex_lock(&wq_pool_mutex);
- BUG_ON(worker_pool_assign_id(pool));
- mutex_unlock(&wq_pool_mutex);
+ for_each_bh_worker_pool(pool, cpu) {
+ init_cpu_worker_pool(pool, cpu, std_nice[i]);
+ pool->flags |= POOL_BH;
+ init_irq_work(bh_pool_irq_work(pool), irq_work_fns[i]);
+ i++;
}
+
+ i = 0;
+ for_each_cpu_worker_pool(pool, cpu)
+ init_cpu_worker_pool(pool, cpu, std_nice[i++]);
}
/* create default unbound and ordered wq attrs */
@@ -6691,13 +7644,17 @@ void __init workqueue_init_early(void)
WQ_FREEZABLE, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
WQ_POWER_EFFICIENT, 0);
- system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
+ system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
WQ_FREEZABLE | WQ_POWER_EFFICIENT,
0);
+ system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
+ system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
+ WQ_BH | WQ_HIGHPRI, 0);
BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
!system_unbound_wq || !system_freezable_wq ||
!system_power_efficient_wq ||
- !system_freezable_power_efficient_wq);
+ !system_freezable_power_efficient_wq ||
+ !system_bh_wq || !system_bh_highpri_wq);
}
static void __init wq_cpu_intensive_thresh_init(void)
@@ -6763,9 +7720,10 @@ void __init workqueue_init(void)
* up. Also, create a rescuer for workqueues that requested it.
*/
for_each_possible_cpu(cpu) {
- for_each_cpu_worker_pool(pool, cpu) {
+ for_each_bh_worker_pool(pool, cpu)
+ pool->node = cpu_to_node(cpu);
+ for_each_cpu_worker_pool(pool, cpu)
pool->node = cpu_to_node(cpu);
- }
}
list_for_each_entry(wq, &workqueues, list) {
@@ -6776,7 +7734,16 @@ void __init workqueue_init(void)
mutex_unlock(&wq_pool_mutex);
- /* create the initial workers */
+ /*
+ * Create the initial workers. A BH pool has one pseudo worker that
+ * represents the shared BH execution context and thus doesn't get
+ * affected by hotplug events. Create the BH pseudo workers for all
+ * possible CPUs here.
+ */
+ for_each_possible_cpu(cpu)
+ for_each_bh_worker_pool(pool, cpu)
+ BUG_ON(!create_worker(pool));
+
for_each_online_cpu(cpu) {
for_each_cpu_worker_pool(pool, cpu) {
pool->flags &= ~POOL_DISASSOCIATED;
@@ -6856,7 +7823,7 @@ static bool __init cpus_share_numa(int cpu0, int cpu1)
/**
* workqueue_init_topology - initialize CPU pods for unbound workqueues
*
- * This is the third step of there-staged workqueue subsystem initialization and
+ * This is the third step of three-staged workqueue subsystem initialization and
* invoked after SMP and topology information are fully initialized. It
* initializes the unbound CPU pods accordingly.
*/
@@ -6870,6 +7837,8 @@ void __init workqueue_init_topology(void)
init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache);
init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa);
+ wq_topo_initialized = true;
+
mutex_lock(&wq_pool_mutex);
/*
@@ -6878,8 +7847,12 @@ void __init workqueue_init_topology(void)
* combinations to apply per-pod sharing.
*/
list_for_each_entry(wq, &workqueues, list) {
- for_each_online_cpu(cpu) {
+ for_each_online_cpu(cpu)
wq_update_pod(wq, cpu, cpu, true);
+ if (wq->flags & WQ_UNBOUND) {
+ mutex_lock(&wq->mutex);
+ wq_update_node_max_active(wq, -1);
+ mutex_unlock(&wq->mutex);
}
}