diff options
Diffstat (limited to 'kernel')
51 files changed, 2579 insertions, 1347 deletions
diff --git a/kernel/async.c b/kernel/async.c index 97f224a5257b..4c3e6a44595f 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -64,6 +64,7 @@ static async_cookie_t next_cookie = 1; static LIST_HEAD(async_global_pending); /* pending from all registered doms */ static ASYNC_DOMAIN(async_dfl_domain); static DEFINE_SPINLOCK(async_lock); +static struct workqueue_struct *async_wq; struct async_entry { struct list_head domain_list; @@ -174,7 +175,7 @@ static async_cookie_t __async_schedule_node_domain(async_func_t func, spin_unlock_irqrestore(&async_lock, flags); /* schedule for execution */ - queue_work_node(node, system_unbound_wq, &entry->work); + queue_work_node(node, async_wq, &entry->work); return newcookie; } @@ -345,3 +346,17 @@ bool current_is_async(void) return worker && worker->current_func == async_run_entry_fn; } EXPORT_SYMBOL_GPL(current_is_async); + +void __init async_init(void) +{ + /* + * Async can schedule a number of interdependent work items. However, + * unbound workqueues can handle only upto min_active interdependent + * work items. The default min_active of 8 isn't sufficient for async + * and can lead to stalls. Let's use a dedicated workqueue with raised + * min_active. + */ + async_wq = alloc_workqueue("async", WQ_UNBOUND, 0); + BUG_ON(!async_wq); + workqueue_set_min_active(async_wq, WQ_DFL_ACTIVE); +} diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index 370217dd7e39..a4181234232b 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -21,24 +21,20 @@ static void backtrace_test_normal(void) dump_stack(); } -static DECLARE_COMPLETION(backtrace_work); - -static void backtrace_test_irq_callback(unsigned long data) +static void backtrace_test_bh_workfn(struct work_struct *work) { dump_stack(); - complete(&backtrace_work); } -static DECLARE_TASKLET_OLD(backtrace_tasklet, &backtrace_test_irq_callback); +static DECLARE_WORK(backtrace_bh_work, &backtrace_test_bh_workfn); -static void backtrace_test_irq(void) +static void backtrace_test_bh(void) { - pr_info("Testing a backtrace from irq context.\n"); + pr_info("Testing a backtrace from BH context.\n"); pr_info("The following trace is a kernel self test and not a bug!\n"); - init_completion(&backtrace_work); - tasklet_schedule(&backtrace_tasklet); - wait_for_completion(&backtrace_work); + queue_work(system_bh_wq, &backtrace_bh_work); + flush_work(&backtrace_bh_work); } #ifdef CONFIG_STACKTRACE @@ -65,7 +61,7 @@ static int backtrace_regression_test(void) pr_info("====[ backtrace testing ]===========\n"); backtrace_test_normal(); - backtrace_test_irq(); + backtrace_test_bh(); backtrace_test_saved(); pr_info("====[ end of backtrace testing ]====\n"); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8a0bb80fe48a..ef82ffc90cbe 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -178,7 +178,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, void **frames, int n, struct xdp_cpumap_stats *stats) { - struct xdp_rxq_info rxq; + struct xdp_rxq_info rxq = {}; struct xdp_buff xdp; int i, nframes = 0; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index be72824f32b2..d19cd863d294 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1101,6 +1101,7 @@ struct bpf_hrtimer { struct bpf_prog *prog; void __rcu *callback_fn; void *value; + struct rcu_head rcu; }; /* the actual struct hidden inside uapi struct bpf_timer */ @@ -1332,6 +1333,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) if (in_nmi()) return -EOPNOTSUPP; + rcu_read_lock(); __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; if (!t) { @@ -1353,6 +1355,7 @@ out: * if it was running. */ ret = ret ?: hrtimer_cancel(&t->timer); + rcu_read_unlock(); return ret; } @@ -1407,7 +1410,7 @@ out: */ if (this_cpu_read(hrtimer_running) != t) hrtimer_cancel(&t->timer); - kfree(t); + kfree_rcu(t, rcu); } BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index e5c3500443c6..ec4e97c61eef 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -978,6 +978,8 @@ __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) != __alignof__(struct bpf_iter_task)); + kit->pos = NULL; + switch (flags) { case BPF_TASK_ITER_ALL_THREADS: case BPF_TASK_ITER_ALL_PROCS: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 65f598694d55..ddea9567f755 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5227,7 +5227,9 @@ BTF_ID(struct, prog_test_ref_kfunc) #ifdef CONFIG_CGROUPS BTF_ID(struct, cgroup) #endif +#ifdef CONFIG_BPF_JIT BTF_ID(struct, bpf_cpumask) +#endif BTF_ID(struct, task_struct) BTF_SET_END(rcu_protected_types) @@ -16600,6 +16602,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat { int i; + if (old->callback_depth > cur->callback_depth) + return false; + for (i = 0; i < MAX_BPF_REG; i++) if (!regsafe(env, &old->regs[i], &cur->regs[i], &env->idmap_scratch, exact)) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ba36c073304a..4237c8748715 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2562,7 +2562,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, update_partition_sd_lb(cs, old_prs); out_free: free_cpumasks(NULL, &tmp); - return 0; + return retval; } /** @@ -2598,9 +2598,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) return 0; - if (alloc_cpumasks(NULL, &tmp)) - return -ENOMEM; - if (*buf) compute_effective_exclusive_cpumask(trialcs, NULL); @@ -2615,6 +2612,9 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval) return retval; + if (alloc_cpumasks(NULL, &tmp)) + return -ENOMEM; + if (old_prs) { if (cpumask_empty(trialcs->effective_xcpus)) { invalidate = true; @@ -3897,6 +3897,7 @@ static struct cftype legacy_files[] = { }, { + /* obsolete, may be removed in the future */ .name = "memory_spread_slab", .read_u64 = cpuset_read_u64, .write_u64 = cpuset_write_u64, diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 6ef0b35fc28c..70ae70d03823 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -458,6 +458,8 @@ static __always_inline void context_tracking_recursion_exit(void) * __ct_user_enter - Inform the context tracking that the CPU is going * to enter user or guest space mode. * + * @state: userspace context-tracking state to enter. + * * This function must be called right before we switch from the kernel * to user or guest space, when it's guaranteed the remaining kernel * instructions to execute won't use any RCU read side critical section @@ -595,6 +597,8 @@ NOKPROBE_SYMBOL(user_enter_callable); * __ct_user_exit - Inform the context tracking that the CPU is * exiting user or guest mode and entering the kernel. * + * @state: userspace context-tracking state being exited from. + * * This function must be called after we entered the kernel from user or * guest space before any use of RCU read side critical section. This * potentially include any high level kernel code like syscalls, exceptions, diff --git a/kernel/cpu.c b/kernel/cpu.c index e6ec3ba4950b..cc4a8068747c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -54,7 +54,6 @@ * @rollback: Perform a rollback * @single: Single callback invocation * @bringup: Single callback bringup or teardown selector - * @cpu: CPU number * @node: Remote CPU node; for multi-instance, do a * single entry callback for install/remove * @last: For multi-instance rollback, remember how far we got @@ -3005,7 +3004,7 @@ static ssize_t control_show(struct device *dev, return sysfs_emit(buf, "%d\n", cpu_smt_num_threads); #endif - return snprintf(buf, PAGE_SIZE - 2, "%s\n", state); + return sysfs_emit(buf, "%s\n", state); } static ssize_t control_store(struct device *dev, struct device_attribute *attr, @@ -3018,7 +3017,7 @@ static DEVICE_ATTR_RW(control); static ssize_t active_show(struct device *dev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active()); + return sysfs_emit(buf, "%d\n", sched_smt_active()); } static DEVICE_ATTR_RO(active); @@ -3107,10 +3106,10 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE -struct cpumask __cpu_possible_mask __read_mostly +struct cpumask __cpu_possible_mask __ro_after_init = {CPU_BITS_ALL}; #else -struct cpumask __cpu_possible_mask __read_mostly; +struct cpumask __cpu_possible_mask __ro_after_init; #endif EXPORT_SYMBOL(__cpu_possible_mask); diff --git a/kernel/exit.c b/kernel/exit.c index 3988a02efaef..41a12630cbbc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; + /* + * sub-thread or delay_group_leader(), wake up the + * PIDFD_THREAD waiters. + */ + if (!thread_group_empty(tsk)) + do_notify_pidfd(tsk); + if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && @@ -1127,17 +1134,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * and nobody can change them. * * psig->stats_lock also protects us from our sub-threads - * which can reap other children at the same time. Until - * we change k_getrusage()-like users to rely on this lock - * we have to take ->siglock as well. + * which can reap other children at the same time. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); - spin_lock_irq(¤t->sighand->siglock); - write_seqlock(&psig->stats_lock); + write_seqlock_irq(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1160,8 +1164,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); - write_sequnlock(&psig->stats_lock); - spin_unlock_irq(¤t->sighand->siglock); + write_sequnlock_irq(&psig->stats_lock); } if (wo->wo_rusage) @@ -1893,30 +1896,6 @@ Efault: } #endif -/** - * thread_group_exited - check that a thread group has exited - * @pid: tgid of thread group to be checked. - * - * Test if the thread group represented by tgid has exited (all - * threads are zombies, dead or completely gone). - * - * Return: true if the thread group has exited. false otherwise. - */ -bool thread_group_exited(struct pid *pid) -{ - struct task_struct *task; - bool exited; - - rcu_read_lock(); - task = pid_task(pid, PIDTYPE_PID); - exited = !task || - (READ_ONCE(task->exit_state) && thread_group_empty(task)); - rcu_read_unlock(); - - return exited; -} -EXPORT_SYMBOL(thread_group_exited); - /* * This needs to be __function_aligned as GCC implicitly makes any * implementation of abort() cold and drops alignment specified by diff --git a/kernel/fork.c b/kernel/fork.c index 0d944e92a43f..39a5046c2f0b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -101,6 +101,8 @@ #include <linux/user_events.h> #include <linux/iommu.h> #include <linux/rseq.h> +#include <uapi/linux/pidfd.h> +#include <linux/pidfs.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -1976,6 +1978,7 @@ static inline void rcu_copy_process(struct task_struct *p) p->rcu_tasks_holdout = false; INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; + INIT_LIST_HEAD(&p->rcu_tasks_exit_list); #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; @@ -1985,119 +1988,6 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -struct pid *pidfd_pid(const struct file *file) -{ - if (file->f_op == &pidfd_fops) - return file->private_data; - - return ERR_PTR(-EBADF); -} - -static int pidfd_release(struct inode *inode, struct file *file) -{ - struct pid *pid = file->private_data; - - file->private_data = NULL; - put_pid(pid); - return 0; -} - -#ifdef CONFIG_PROC_FS -/** - * pidfd_show_fdinfo - print information about a pidfd - * @m: proc fdinfo file - * @f: file referencing a pidfd - * - * Pid: - * This function will print the pid that a given pidfd refers to in the - * pid namespace of the procfs instance. - * If the pid namespace of the process is not a descendant of the pid - * namespace of the procfs instance 0 will be shown as its pid. This is - * similar to calling getppid() on a process whose parent is outside of - * its pid namespace. - * - * NSpid: - * If pid namespaces are supported then this function will also print - * the pid of a given pidfd refers to for all descendant pid namespaces - * starting from the current pid namespace of the instance, i.e. the - * Pid field and the first entry in the NSpid field will be identical. - * If the pid namespace of the process is not a descendant of the pid - * namespace of the procfs instance 0 will be shown as its first NSpid - * entry and no others will be shown. - * Note that this differs from the Pid and NSpid fields in - * /proc/<pid>/status where Pid and NSpid are always shown relative to - * the pid namespace of the procfs instance. The difference becomes - * obvious when sending around a pidfd between pid namespaces from a - * different branch of the tree, i.e. where no ancestral relation is - * present between the pid namespaces: - * - create two new pid namespaces ns1 and ns2 in the initial pid - * namespace (also take care to create new mount namespaces in the - * new pid namespace and mount procfs) - * - create a process with a pidfd in ns1 - * - send pidfd from ns1 to ns2 - * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid - * have exactly one entry, which is 0 - */ -static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct pid *pid = f->private_data; - struct pid_namespace *ns; - pid_t nr = -1; - - if (likely(pid_has_task(pid, PIDTYPE_PID))) { - ns = proc_pid_ns(file_inode(m->file)->i_sb); - nr = pid_nr_ns(pid, ns); - } - - seq_put_decimal_ll(m, "Pid:\t", nr); - -#ifdef CONFIG_PID_NS - seq_put_decimal_ll(m, "\nNSpid:\t", nr); - if (nr > 0) { - int i; - - /* If nr is non-zero it means that 'pid' is valid and that - * ns, i.e. the pid namespace associated with the procfs - * instance, is in the pid namespace hierarchy of pid. - * Start at one below the already printed level. - */ - for (i = ns->level + 1; i <= pid->level; i++) - seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); - } -#endif - seq_putc(m, '\n'); -} -#endif - -/* - * Poll support for process exit notification. - */ -static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) -{ - struct pid *pid = file->private_data; - __poll_t poll_flags = 0; - - poll_wait(file, &pid->wait_pidfd, pts); - - /* - * Inform pollers only when the whole thread group exits. - * If the thread group leader exits before all other threads in the - * group, then poll(2) should block, similar to the wait(2) family. - */ - if (thread_group_exited(pid)) - poll_flags = EPOLLIN | EPOLLRDNORM; - - return poll_flags; -} - -const struct file_operations pidfd_fops = { - .release = pidfd_release, - .poll = pidfd_poll, -#ifdef CONFIG_PROC_FS - .show_fdinfo = pidfd_show_fdinfo, -#endif -}; - /** * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd @@ -2131,20 +2021,20 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re int pidfd; struct file *pidfd_file; - if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) - return -EINVAL; - - pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + pidfd = get_unused_fd_flags(O_CLOEXEC); if (pidfd < 0) return pidfd; - pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, - flags | O_RDWR | O_CLOEXEC); + pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR); if (IS_ERR(pidfd_file)) { put_unused_fd(pidfd); return PTR_ERR(pidfd_file); } - get_pid(pid); /* held by pidfd_file now */ + /* + * anon_inode_getfile() ignores everything outside of the + * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually. + */ + pidfd_file->f_flags |= (flags & PIDFD_THREAD); *ret = pidfd_file; return pidfd; } @@ -2158,7 +2048,8 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. * - * The helper verifies that @pid is used as a thread group leader. + * The helper verifies that @pid is still in use, without PIDFD_THREAD the + * task identified by @pid must be a thread-group leader. * * If this function returns successfully the caller is responsible to either * call fd_install() passing the returned pidfd and pidfd file as arguments in @@ -2177,7 +2068,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) + bool thread = flags & PIDFD_THREAD; + + if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) return -EINVAL; return __pidfd_prepare(pid, flags, ret); @@ -2299,9 +2192,8 @@ __latent_entropy struct task_struct *copy_process( /* * - CLONE_DETACHED is blocked so that we can potentially * reuse it later for CLONE_PIDFD. - * - CLONE_THREAD is blocked until someone really needs it. */ - if (clone_flags & (CLONE_DETACHED | CLONE_THREAD)) + if (clone_flags & CLONE_DETACHED) return ERR_PTR(-EINVAL); } @@ -2524,8 +2416,10 @@ __latent_entropy struct task_struct *copy_process( * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { + int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; + /* Note that no task has been attached to @pid yet. */ - retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile); + retval = __pidfd_prepare(pid, flags, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; @@ -2876,8 +2770,8 @@ pid_t kernel_clone(struct kernel_clone_args *args) * here has the advantage that we don't need to have a separate helper * to check for legacy clone(). */ - if ((args->flags & CLONE_PIDFD) && - (args->flags & CLONE_PARENT_SETTID) && + if ((clone_flags & CLONE_PIDFD) && + (clone_flags & CLONE_PARENT_SETTID) && (args->pidfd == args->parent_tid)) return -EINVAL; diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index dd76323ea3fd..38d6ae651ac7 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -4,10 +4,11 @@ * Copyright (C) 2020 Bartosz Golaszewski <bgolaszewski@baylibre.com> */ +#include <linux/cleanup.h> +#include <linux/interrupt.h> #include <linux/irq.h> #include <linux/irq_sim.h> #include <linux/irq_work.h> -#include <linux/interrupt.h> #include <linux/slab.h> struct irq_sim_work_ctx { @@ -19,7 +20,6 @@ struct irq_sim_work_ctx { }; struct irq_sim_irq_ctx { - int irqnum; bool enabled; struct irq_sim_work_ctx *work_ctx; }; @@ -164,33 +164,27 @@ static const struct irq_domain_ops irq_sim_domain_ops = { struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode, unsigned int num_irqs) { - struct irq_sim_work_ctx *work_ctx; + struct irq_sim_work_ctx *work_ctx __free(kfree) = + kmalloc(sizeof(*work_ctx), GFP_KERNEL); - work_ctx = kmalloc(sizeof(*work_ctx), GFP_KERNEL); if (!work_ctx) - goto err_out; + return ERR_PTR(-ENOMEM); - work_ctx->pending = bitmap_zalloc(num_irqs, GFP_KERNEL); - if (!work_ctx->pending) - goto err_free_work_ctx; + unsigned long *pending __free(bitmap) = bitmap_zalloc(num_irqs, GFP_KERNEL); + if (!pending) + return ERR_PTR(-ENOMEM); work_ctx->domain = irq_domain_create_linear(fwnode, num_irqs, &irq_sim_domain_ops, work_ctx); if (!work_ctx->domain) - goto err_free_bitmap; + return ERR_PTR(-ENOMEM); work_ctx->irq_count = num_irqs; work_ctx->work = IRQ_WORK_INIT_HARD(irq_sim_handle_irq); + work_ctx->pending = no_free_ptr(pending); - return work_ctx->domain; - -err_free_bitmap: - bitmap_free(work_ctx->pending); -err_free_work_ctx: - kfree(work_ctx); -err_out: - return ERR_PTR(-ENOMEM); + return no_free_ptr(work_ctx)->domain; } EXPORT_SYMBOL_GPL(irq_domain_create_sim); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 371eb1711d34..4c6b32318ce3 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -92,11 +92,23 @@ static void desc_smp_init(struct irq_desc *desc, int node, #endif } +static void free_masks(struct irq_desc *desc) +{ +#ifdef CONFIG_GENERIC_PENDING_IRQ + free_cpumask_var(desc->pending_mask); +#endif + free_cpumask_var(desc->irq_common_data.affinity); +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + free_cpumask_var(desc->irq_common_data.effective_affinity); +#endif +} + #else static inline int alloc_masks(struct irq_desc *desc, int node) { return 0; } static inline void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } +static inline void free_masks(struct irq_desc *desc) { } #endif static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, @@ -166,6 +178,39 @@ static void delete_irq_desc(unsigned int irq) } #ifdef CONFIG_SPARSE_IRQ +static const struct kobj_type irq_kobj_type; +#endif + +static int init_desc(struct irq_desc *desc, int irq, int node, + unsigned int flags, + const struct cpumask *affinity, + struct module *owner) +{ + desc->kstat_irqs = alloc_percpu(unsigned int); + if (!desc->kstat_irqs) + return -ENOMEM; + + if (alloc_masks(desc, node)) { + free_percpu(desc->kstat_irqs); + return -ENOMEM; + } + + raw_spin_lock_init(&desc->lock); + lockdep_set_class(&desc->lock, &irq_desc_lock_class); + mutex_init(&desc->request_mutex); + init_waitqueue_head(&desc->wait_for_threads); + desc_set_defaults(irq, desc, node, affinity, owner); + irqd_set(&desc->irq_data, flags); + irq_resend_init(desc); +#ifdef CONFIG_SPARSE_IRQ + kobject_init(&desc->kobj, &irq_kobj_type); + init_rcu_head(&desc->rcu); +#endif + + return 0; +} + +#ifdef CONFIG_SPARSE_IRQ static void irq_kobj_release(struct kobject *kobj); @@ -384,21 +429,6 @@ struct irq_desc *irq_to_desc(unsigned int irq) EXPORT_SYMBOL_GPL(irq_to_desc); #endif -#ifdef CONFIG_SMP -static void free_masks(struct irq_desc *desc) -{ -#ifdef CONFIG_GENERIC_PENDING_IRQ - free_cpumask_var(desc->pending_mask); -#endif - free_cpumask_var(desc->irq_common_data.affinity); -#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK - free_cpumask_var(desc->irq_common_data.effective_affinity); -#endif -} -#else -static inline void free_masks(struct irq_desc *desc) { } -#endif - void irq_lock_sparse(void) { mutex_lock(&sparse_irq_lock); @@ -414,36 +444,19 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, struct module *owner) { struct irq_desc *desc; + int ret; desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node); if (!desc) return NULL; - /* allocate based on nr_cpu_ids */ - desc->kstat_irqs = alloc_percpu(unsigned int); - if (!desc->kstat_irqs) - goto err_desc; - - if (alloc_masks(desc, node)) - goto err_kstat; - raw_spin_lock_init(&desc->lock); - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - mutex_init(&desc->request_mutex); - init_rcu_head(&desc->rcu); - init_waitqueue_head(&desc->wait_for_threads); - - desc_set_defaults(irq, desc, node, affinity, owner); - irqd_set(&desc->irq_data, flags); - kobject_init(&desc->kobj, &irq_kobj_type); - irq_resend_init(desc); + ret = init_desc(desc, irq, node, flags, affinity, owner); + if (unlikely(ret)) { + kfree(desc); + return NULL; + } return desc; - -err_kstat: - free_percpu(desc->kstat_irqs); -err_desc: - kfree(desc); - return NULL; } static void irq_kobj_release(struct kobject *kobj) @@ -583,26 +596,29 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { int __init early_irq_init(void) { int count, i, node = first_online_node; - struct irq_desc *desc; + int ret; init_irq_default_affinity(); printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS); - desc = irq_desc; count = ARRAY_SIZE(irq_desc); for (i = 0; i < count; i++) { - desc[i].kstat_irqs = alloc_percpu(unsigned int); - alloc_masks(&desc[i], node); - raw_spin_lock_init(&desc[i].lock); - lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - mutex_init(&desc[i].request_mutex); - init_waitqueue_head(&desc[i].wait_for_threads); - desc_set_defaults(i, &desc[i], node, NULL, NULL); - irq_resend_init(&desc[i]); + ret = init_desc(irq_desc + i, i, node, 0, NULL, NULL); + if (unlikely(ret)) + goto __free_desc_res; } + return arch_early_irq_init(); + +__free_desc_res: + while (--i >= 0) { + free_masks(irq_desc + i); + free_percpu(irq_desc[i].kstat_irqs); + } + + return ret; } struct irq_desc *irq_to_desc(unsigned int irq) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 0bdef4fe925b..3dd1c871e091 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -29,6 +29,7 @@ static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct irq_affinity_desc *affinity); static void irq_domain_check_hierarchy(struct irq_domain *domain); +static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq); struct irqchip_fwid { struct fwnode_handle fwnode; @@ -448,7 +449,7 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, */ mutex_lock(&irq_domain_mutex); list_for_each_entry(h, &irq_domain_list, link) { - if (h->ops->select && fwspec->param_count) + if (h->ops->select && bus_token != DOMAIN_BUS_ANY) rc = h->ops->select(h, fwspec, bus_token); else if (h->ops->match) rc = h->ops->match(h, to_of_node(fwnode), bus_token); @@ -858,8 +859,13 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) } if (irq_domain_is_hierarchy(domain)) { - virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE, - fwspec, false, NULL); + if (irq_domain_is_msi_device(domain)) { + mutex_unlock(&domain->root->mutex); + virq = msi_device_domain_alloc_wired(domain, hwirq, type); + mutex_lock(&domain->root->mutex); + } else + virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE, + fwspec, false, NULL); if (virq <= 0) { virq = 0; goto out; @@ -914,7 +920,7 @@ void irq_dispose_mapping(unsigned int virq) return; if (irq_domain_is_hierarchy(domain)) { - irq_domain_free_irqs(virq, 1); + irq_domain_free_one_irq(domain, virq); } else { irq_domain_disassociate(domain, virq); irq_free_desc(virq); @@ -1755,6 +1761,14 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) irq_free_descs(virq, nr_irqs); } +static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq) +{ + if (irq_domain_is_msi_device(domain)) + msi_device_domain_free_wired(domain, virq); + else + irq_domain_free_irqs(virq, 1); +} + /** * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain * @domain: Domain below which interrupts must be allocated @@ -1907,9 +1921,9 @@ static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, return -EINVAL; } -static void irq_domain_check_hierarchy(struct irq_domain *domain) -{ -} +static void irq_domain_check_hierarchy(struct irq_domain *domain) { } +static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq) { } + #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #ifdef CONFIG_GENERIC_IRQ_DEBUGFS diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1782f90cd8c6..ad3eaf2ab959 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -192,10 +192,14 @@ void irq_set_thread_affinity(struct irq_desc *desc) struct irqaction *action; for_each_action_of_desc(desc, action) { - if (action->thread) + if (action->thread) { set_bit(IRQTF_AFFINITY, &action->thread_flags); - if (action->secondary && action->secondary->thread) + wake_up_process(action->thread); + } + if (action->secondary && action->secondary->thread) { set_bit(IRQTF_AFFINITY, &action->secondary->thread_flags); + wake_up_process(action->secondary->thread); + } } } @@ -1049,10 +1053,57 @@ static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) return IRQ_NONE; } -static int irq_wait_for_interrupt(struct irqaction *action) +#ifdef CONFIG_SMP +/* + * Check whether we need to change the affinity of the interrupt thread. + */ +static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) +{ + cpumask_var_t mask; + bool valid = false; + + if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) + return; + + __set_current_state(TASK_RUNNING); + + /* + * In case we are out of memory we set IRQTF_AFFINITY again and + * try again next time + */ + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { + set_bit(IRQTF_AFFINITY, &action->thread_flags); + return; + } + + raw_spin_lock_irq(&desc->lock); + /* + * This code is triggered unconditionally. Check the affinity + * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. + */ + if (cpumask_available(desc->irq_common_data.affinity)) { + const struct cpumask *m; + + m = irq_data_get_effective_affinity_mask(&desc->irq_data); + cpumask_copy(mask, m); + valid = true; + } + raw_spin_unlock_irq(&desc->lock); + + if (valid) + set_cpus_allowed_ptr(current, mask); + free_cpumask_var(mask); +} +#else +static inline void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } +#endif + +static int irq_wait_for_interrupt(struct irq_desc *desc, + struct irqaction *action) { for (;;) { set_current_state(TASK_INTERRUPTIBLE); + irq_thread_check_affinity(desc, action); if (kthread_should_stop()) { /* may need to run one last time */ @@ -1129,52 +1180,6 @@ out_unlock: chip_bus_sync_unlock(desc); } -#ifdef CONFIG_SMP -/* - * Check whether we need to change the affinity of the interrupt thread. - */ -static void -irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) -{ - cpumask_var_t mask; - bool valid = true; - - if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) - return; - - /* - * In case we are out of memory we set IRQTF_AFFINITY again and - * try again next time - */ - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { - set_bit(IRQTF_AFFINITY, &action->thread_flags); - return; - } - - raw_spin_lock_irq(&desc->lock); - /* - * This code is triggered unconditionally. Check the affinity - * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. - */ - if (cpumask_available(desc->irq_common_data.affinity)) { - const struct cpumask *m; - - m = irq_data_get_effective_affinity_mask(&desc->irq_data); - cpumask_copy(mask, m); - } else { - valid = false; - } - raw_spin_unlock_irq(&desc->lock); - - if (valid) - set_cpus_allowed_ptr(current, mask); - free_cpumask_var(mask); -} -#else -static inline void -irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } -#endif - /* * Interrupts which are not explicitly requested as threaded * interrupts rely on the implicit bh/preempt disable of the hard irq @@ -1312,13 +1317,9 @@ static int irq_thread(void *data) init_task_work(&on_exit_work, irq_thread_dtor); task_work_add(current, &on_exit_work, TWA_NONE); - irq_thread_check_affinity(desc, action); - - while (!irq_wait_for_interrupt(action)) { + while (!irq_wait_for_interrupt(desc, action)) { irqreturn_t action_ret; - irq_thread_check_affinity(desc, action); - action_ret = handler_fn(desc, action); if (action_ret == IRQ_WAKE_THREAD) irq_wake_secondary(desc, action); diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 75d0ae490e29..8f222d1cccec 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -8,8 +8,6 @@ #include <linux/cpu.h> #include <linux/irq.h> -#define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS)) - struct cpumap { unsigned int available; unsigned int allocated; @@ -17,8 +15,8 @@ struct cpumap { unsigned int managed_allocated; bool initialized; bool online; - unsigned long alloc_map[IRQ_MATRIX_SIZE]; - unsigned long managed_map[IRQ_MATRIX_SIZE]; + unsigned long *managed_map; + unsigned long alloc_map[]; }; struct irq_matrix { @@ -32,8 +30,8 @@ struct irq_matrix { unsigned int total_allocated; unsigned int online_maps; struct cpumap __percpu *maps; - unsigned long scratch_map[IRQ_MATRIX_SIZE]; - unsigned long system_map[IRQ_MATRIX_SIZE]; + unsigned long *system_map; + unsigned long scratch_map[]; }; #define CREATE_TRACE_POINTS @@ -50,24 +48,32 @@ __init struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits, unsigned int alloc_start, unsigned int alloc_end) { + unsigned int cpu, matrix_size = BITS_TO_LONGS(matrix_bits); struct irq_matrix *m; - if (matrix_bits > IRQ_MATRIX_BITS) - return NULL; - - m = kzalloc(sizeof(*m), GFP_KERNEL); + m = kzalloc(struct_size(m, scratch_map, matrix_size * 2), GFP_KERNEL); if (!m) return NULL; + m->system_map = &m->scratch_map[matrix_size]; + m->matrix_bits = matrix_bits; m->alloc_start = alloc_start; m->alloc_end = alloc_end; m->alloc_size = alloc_end - alloc_start; - m->maps = alloc_percpu(*m->maps); + m->maps = __alloc_percpu(struct_size(m->maps, alloc_map, matrix_size * 2), + __alignof__(*m->maps)); if (!m->maps) { kfree(m); return NULL; } + + for_each_possible_cpu(cpu) { + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + + cm->managed_map = &cm->alloc_map[matrix_size]; + } + return m; } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 79b4a58ba9c3..f90952ebc494 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -726,11 +726,26 @@ static void msi_domain_free(struct irq_domain *domain, unsigned int virq, irq_domain_free_irqs_top(domain, virq, nr_irqs); } +static int msi_domain_translate(struct irq_domain *domain, struct irq_fwspec *fwspec, + irq_hw_number_t *hwirq, unsigned int *type) +{ + struct msi_domain_info *info = domain->host_data; + + /* + * This will catch allocations through the regular irqdomain path except + * for MSI domains which really support this, e.g. MBIGEN. + */ + if (!info->ops->msi_translate) + return -ENOTSUPP; + return info->ops->msi_translate(domain, fwspec, hwirq, type); +} + static const struct irq_domain_ops msi_domain_ops = { .alloc = msi_domain_alloc, .free = msi_domain_free, .activate = msi_domain_activate, .deactivate = msi_domain_deactivate, + .translate = msi_domain_translate, }; static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, @@ -830,8 +845,11 @@ static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode, domain = irq_domain_create_hierarchy(parent, flags | IRQ_DOMAIN_FLAG_MSI, 0, fwnode, &msi_domain_ops, info); - if (domain) + if (domain) { irq_domain_update_bus_token(domain, info->bus_token); + if (info->flags & MSI_FLAG_PARENT_PM_DEV) + domain->pm_dev = parent->pm_dev; + } return domain; } @@ -945,9 +963,9 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, void *chip_data) { struct irq_domain *domain, *parent = dev->msi.domain; - const struct msi_parent_ops *pops; + struct fwnode_handle *fwnode, *fwnalloced = NULL; struct msi_domain_template *bundle; - struct fwnode_handle *fwnode; + const struct msi_parent_ops *pops; if (!irq_domain_is_msi_parent(parent)) return false; @@ -970,7 +988,19 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, pops->prefix ? : "", bundle->chip.name, dev_name(dev)); bundle->chip.name = bundle->name; - fwnode = irq_domain_alloc_named_fwnode(bundle->name); + /* + * Using the device firmware node is required for wire to MSI + * device domains so that the existing firmware results in a domain + * match. + * All other device domains like PCI/MSI use the named firmware + * node as they are not guaranteed to have a fwnode. They are never + * looked up and always handled in the context of the device. + */ + if (bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE) + fwnode = dev->fwnode; + else + fwnode = fwnalloced = irq_domain_alloc_named_fwnode(bundle->name); + if (!fwnode) goto free_bundle; @@ -997,7 +1027,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, fail: msi_unlock_descs(dev); free_fwnode: - irq_domain_free_fwnode(fwnode); + irq_domain_free_fwnode(fwnalloced); free_bundle: kfree(bundle); return false; @@ -1431,34 +1461,10 @@ int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int return msi_domain_alloc_locked(dev, &ctrl); } -/** - * msi_domain_alloc_irq_at - Allocate an interrupt from a MSI interrupt domain at - * a given index - or at the next free index - * - * @dev: Pointer to device struct of the device for which the interrupts - * are allocated - * @domid: Id of the interrupt domain to operate on - * @index: Index for allocation. If @index == %MSI_ANY_INDEX the allocation - * uses the next free index. - * @affdesc: Optional pointer to an interrupt affinity descriptor structure - * @icookie: Optional pointer to a domain specific per instance cookie. If - * non-NULL the content of the cookie is stored in msi_desc::data. - * Must be NULL for MSI-X allocations - * - * This requires a MSI interrupt domain which lets the core code manage the - * MSI descriptors. - * - * Return: struct msi_map - * - * On success msi_map::index contains the allocated index number and - * msi_map::virq the corresponding Linux interrupt number - * - * On failure msi_map::index contains the error code and msi_map::virq - * is %0. - */ -struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, unsigned int index, - const struct irq_affinity_desc *affdesc, - union msi_instance_cookie *icookie) +static struct msi_map __msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, + unsigned int index, + const struct irq_affinity_desc *affdesc, + union msi_instance_cookie *icookie) { struct msi_ctrl ctrl = { .domid = domid, .nirqs = 1, }; struct irq_domain *domain; @@ -1466,17 +1472,16 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u struct msi_desc *desc; int ret; - msi_lock_descs(dev); domain = msi_get_device_domain(dev, domid); if (!domain) { map.index = -ENODEV; - goto unlock; + return map; } desc = msi_alloc_desc(dev, 1, affdesc); if (!desc) { map.index = -ENOMEM; - goto unlock; + return map; } if (icookie) @@ -1485,7 +1490,7 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u ret = msi_insert_desc(dev, desc, domid, index); if (ret) { map.index = ret; - goto unlock; + return map; } ctrl.first = ctrl.last = desc->msi_index; @@ -1498,11 +1503,90 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u map.index = desc->msi_index; map.virq = desc->irq; } -unlock: + return map; +} + +/** + * msi_domain_alloc_irq_at - Allocate an interrupt from a MSI interrupt domain at + * a given index - or at the next free index + * + * @dev: Pointer to device struct of the device for which the interrupts + * are allocated + * @domid: Id of the interrupt domain to operate on + * @index: Index for allocation. If @index == %MSI_ANY_INDEX the allocation + * uses the next free index. + * @affdesc: Optional pointer to an interrupt affinity descriptor structure + * @icookie: Optional pointer to a domain specific per instance cookie. If + * non-NULL the content of the cookie is stored in msi_desc::data. + * Must be NULL for MSI-X allocations + * + * This requires a MSI interrupt domain which lets the core code manage the + * MSI descriptors. + * + * Return: struct msi_map + * + * On success msi_map::index contains the allocated index number and + * msi_map::virq the corresponding Linux interrupt number + * + * On failure msi_map::index contains the error code and msi_map::virq + * is %0. + */ +struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, unsigned int index, + const struct irq_affinity_desc *affdesc, + union msi_instance_cookie *icookie) +{ + struct msi_map map; + + msi_lock_descs(dev); + map = __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); msi_unlock_descs(dev); return map; } +/** + * msi_device_domain_alloc_wired - Allocate a "wired" interrupt on @domain + * @domain: The domain to allocate on + * @hwirq: The hardware interrupt number to allocate for + * @type: The interrupt type + * + * This weirdness supports wire to MSI controllers like MBIGEN. + * + * @hwirq is the hardware interrupt number which is handed in from + * irq_create_fwspec_mapping(). As the wire to MSI domain is sparse, but + * sized in firmware, the hardware interrupt number cannot be used as MSI + * index. For the underlying irq chip the MSI index is irrelevant and + * all it needs is the hardware interrupt number. + * + * To handle this the MSI index is allocated with MSI_ANY_INDEX and the + * hardware interrupt number is stored along with the type information in + * msi_desc::cookie so the underlying interrupt chip and domain code can + * retrieve it. + * + * Return: The Linux interrupt number (> 0) or an error code + */ +int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq, + unsigned int type) +{ + unsigned int domid = MSI_DEFAULT_DOMAIN; + union msi_instance_cookie icookie = { }; + struct device *dev = domain->dev; + struct msi_map map = { }; + + if (WARN_ON_ONCE(!dev || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI)) + return -EINVAL; + + icookie.value = ((u64)type << 32) | hwirq; + + msi_lock_descs(dev); + if (WARN_ON_ONCE(msi_get_device_domain(dev, domid) != domain)) + map.index = -EINVAL; + else + map = __msi_domain_alloc_irq_at(dev, domid, MSI_ANY_INDEX, NULL, &icookie); + msi_unlock_descs(dev); + + return map.index >= 0 ? map.virq : map.index; +} + static void __msi_domain_free_irqs(struct device *dev, struct irq_domain *domain, struct msi_ctrl *ctrl) { @@ -1629,6 +1713,30 @@ void msi_domain_free_irqs_all(struct device *dev, unsigned int domid) } /** + * msi_device_domain_free_wired - Free a wired interrupt in @domain + * @domain: The domain to free the interrupt on + * @virq: The Linux interrupt number to free + * + * This is the counterpart of msi_device_domain_alloc_wired() for the + * weird wired to MSI converting domains. + */ +void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq) +{ + struct msi_desc *desc = irq_get_msi_desc(virq); + struct device *dev = domain->dev; + + if (WARN_ON_ONCE(!dev || !desc || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI)) + return; + + msi_lock_descs(dev); + if (!WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) { + msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, + desc->msi_index); + } + msi_unlock_descs(dev); +} + +/** * msi_get_domain_info - Get the MSI interrupt domain info for @domain * @domain: The interrupt domain to retrieve data from * diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d5a0ee40bf66..9d9095e81792 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1993,7 +1993,7 @@ NOKPROBE_SYMBOL(__kretprobe_find_ret_addr); unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp, struct llist_node **cur) { - struct kretprobe_instance *ri = NULL; + struct kretprobe_instance *ri; kprobe_opcode_t *ret; if (WARN_ON_ONCE(!cur)) @@ -2802,7 +2802,7 @@ static int show_kprobe_addr(struct seq_file *pi, void *v) { struct hlist_head *head; struct kprobe *p, *kp; - const char *sym = NULL; + const char *sym; unsigned int i = *(loff_t *) v; unsigned long offset = 0; char *modname, namebuf[KSYM_NAME_LEN]; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 15781acaac1c..6ec3deec68c2 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -573,7 +573,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (proc_ns_file(f.file)) err = validate_ns(&nsset, ns); else - err = validate_nsset(&nsset, f.file->private_data); + err = validate_nsset(&nsset, pidfd_pid(f.file)); if (!err) { commit_nsset(&nsset); perf_event_namespaces(current); diff --git a/kernel/pid.c b/kernel/pid.c index b52b10865454..99a0c5eb24b8 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -42,6 +42,7 @@ #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/idr.h> +#include <linux/pidfs.h> #include <net/sock.h> #include <uapi/linux/pidfd.h> @@ -65,6 +66,13 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; +#ifdef CONFIG_FS_PID +/* + * Pseudo filesystems start inode numbering after one. We use Reserved + * PIDs as a natural offset. + */ +static u64 pidfs_ino = RESERVED_PIDS; +#endif /* * PID-map pages start out as NULL, they get allocated upon @@ -272,6 +280,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; +#ifdef CONFIG_FS_PID + pid->stashed = NULL; + pid->ino = ++pidfs_ino; +#endif for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); @@ -349,6 +361,11 @@ static void __change_pid(struct task_struct *task, enum pid_type type, hlist_del_rcu(&task->pid_links[type]); *pid_ptr = new; + if (type == PIDTYPE_PID) { + WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID)); + wake_up_all(&pid->wait_pidfd); + } + for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (pid_has_task(pid, tmp)) return; @@ -391,8 +408,7 @@ void exchange_tids(struct task_struct *left, struct task_struct *right) void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { - if (type == PIDTYPE_PID) - new->thread_pid = old->thread_pid; + WARN_ON_ONCE(type == PIDTYPE_PID); hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } @@ -552,11 +568,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) * Return the task associated with @pidfd. The function takes a reference on * the returned task. The caller is responsible for releasing that reference. * - * Currently, the process identified by @pidfd is always a thread-group leader. - * This restriction currently exists for all aspects of pidfds including pidfd - * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling - * (only supports thread group leaders). - * * Return: On success, the task_struct associated with the pidfd. * On error, a negative errno number will be returned. */ @@ -595,7 +606,7 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ -int pidfd_create(struct pid *pid, unsigned int flags) +static int pidfd_create(struct pid *pid, unsigned int flags) { int pidfd; struct file *pidfd_file; @@ -615,11 +626,8 @@ int pidfd_create(struct pid *pid, unsigned int flags) * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set for - * the process identified by @pid. Currently, the process identified by - * @pid must be a thread-group leader. This restriction currently exists - * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot - * be used with CLONE_THREAD) and pidfd polling (only supports thread group - * leaders). + * the task identified by @pid. Without PIDFD_THREAD flag the target task + * must be a thread-group leader. * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. @@ -629,7 +637,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) int fd; struct pid *p; - if (flags & ~PIDFD_NONBLOCK) + if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD)) return -EINVAL; if (pid <= 0) @@ -682,7 +690,26 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd) up_read(&task->signal->exec_update_lock); - return file ?: ERR_PTR(-EBADF); + if (!file) { + /* + * It is possible that the target thread is exiting; it can be + * either: + * 1. before exit_signals(), which gives a real fd + * 2. before exit_files() takes the task_lock() gives a real fd + * 3. after exit_files() releases task_lock(), ->files is NULL; + * this has PF_EXITING, since it was set in exit_signals(), + * __pidfd_fget() returns EBADF. + * In case 3 we get EBADF, but that really means ESRCH, since + * the task is currently exiting and has freed its files + * struct, so we fix it up. + */ + if (task->flags & PF_EXITING) + file = ERR_PTR(-ESRCH); + else + file = ERR_PTR(-EBADF); + } + + return file; } static int pidfd_getfd(struct pid *pid, int fd) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 6053ddddaf65..692f12fe60c1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -222,7 +222,7 @@ int swsusp_swap_in_use(void) */ static unsigned short root_swap = 0xffff; -static struct bdev_handle *hib_resume_bdev_handle; +static struct file *hib_resume_bdev_file; struct hib_bio_batch { atomic_t count; @@ -276,7 +276,7 @@ static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(hib_resume_bdev_handle->bdev, 1, opf, + bio = bio_alloc(file_bdev(hib_resume_bdev_file), 1, opf, GFP_NOIO | __GFP_HIGH); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); @@ -357,14 +357,14 @@ static int swsusp_swap_check(void) return res; root_swap = res; - hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device, + hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device, BLK_OPEN_WRITE, NULL, NULL); - if (IS_ERR(hib_resume_bdev_handle)) - return PTR_ERR(hib_resume_bdev_handle); + if (IS_ERR(hib_resume_bdev_file)) + return PTR_ERR(hib_resume_bdev_file); - res = set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE); + res = set_blocksize(file_bdev(hib_resume_bdev_file), PAGE_SIZE); if (res < 0) - bdev_release(hib_resume_bdev_handle); + fput(hib_resume_bdev_file); return res; } @@ -1523,10 +1523,10 @@ int swsusp_check(bool exclusive) void *holder = exclusive ? &swsusp_holder : NULL; int error; - hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device, + hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device, BLK_OPEN_READ, holder, NULL); - if (!IS_ERR(hib_resume_bdev_handle)) { - set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE); + if (!IS_ERR(hib_resume_bdev_file)) { + set_blocksize(file_bdev(hib_resume_bdev_file), PAGE_SIZE); clear_page(swsusp_header); error = hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); @@ -1551,11 +1551,11 @@ int swsusp_check(bool exclusive) put: if (error) - bdev_release(hib_resume_bdev_handle); + fput(hib_resume_bdev_file); else pr_debug("Image signature found, resuming\n"); } else { - error = PTR_ERR(hib_resume_bdev_handle); + error = PTR_ERR(hib_resume_bdev_file); } if (error) @@ -1570,12 +1570,12 @@ put: void swsusp_close(void) { - if (IS_ERR(hib_resume_bdev_handle)) { + if (IS_ERR(hib_resume_bdev_file)) { pr_debug("Image device not initialised\n"); return; } - bdev_release(hib_resume_bdev_handle); + fput(hib_resume_bdev_file); } /** diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index bdd7eadb33d8..e7d2dd267593 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -314,6 +314,19 @@ config RCU_LAZY To save power, batch RCU callbacks and flush after delay, memory pressure, or callback list growing too big. + Requires rcu_nocbs=all to be set. + + Use rcutree.enable_rcu_lazy=0 to turn it off at boot time. + +config RCU_LAZY_DEFAULT_OFF + bool "Turn RCU lazy invocation off by default" + depends on RCU_LAZY + default n + help + Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default + off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch + it back on. + config RCU_DOUBLE_CHECK_CB_TIME bool "RCU callback-batch backup time check" depends on RCU_EXPERT diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index f94f65877f2b..86fce206560e 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -528,6 +528,12 @@ struct task_struct *get_rcu_tasks_gp_kthread(void); struct task_struct *get_rcu_tasks_rude_gp_kthread(void); #endif // # ifdef CONFIG_TASKS_RUDE_RCU +#ifdef CONFIG_TASKS_RCU_GENERIC +void tasks_cblist_init_generic(void); +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void tasks_cblist_init_generic(void) { } +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ + #define RCU_SCHEDULER_INACTIVE 0 #define RCU_SCHEDULER_INIT 1 #define RCU_SCHEDULER_RUNNING 2 @@ -543,11 +549,11 @@ enum rcutorture_type { }; #if defined(CONFIG_RCU_LAZY) -unsigned long rcu_lazy_get_jiffies_till_flush(void); -void rcu_lazy_set_jiffies_till_flush(unsigned long j); +unsigned long rcu_get_jiffies_lazy_flush(void); +void rcu_set_jiffies_lazy_flush(unsigned long j); #else -static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; } -static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { } +static inline unsigned long rcu_get_jiffies_lazy_flush(void) { return 0; } +static inline void rcu_set_jiffies_lazy_flush(unsigned long j) { } #endif #if defined(CONFIG_TREE_RCU) @@ -623,12 +629,7 @@ int rcu_get_gp_kthreads_prio(void); void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; -#ifdef CONFIG_RCU_EXP_KTHREAD extern struct kthread_worker *rcu_exp_gp_kworker; -extern struct kthread_worker *rcu_exp_par_gp_kworker; -#else /* !CONFIG_RCU_EXP_KTHREAD */ -extern struct workqueue_struct *rcu_par_gp_wq; -#endif /* CONFIG_RCU_EXP_KTHREAD */ void rcu_gp_slow_register(atomic_t *rgssp); void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index ffdb30495e3c..8db4fedaaa1e 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -764,9 +764,9 @@ kfree_scale_init(void) if (kfree_by_call_rcu) { /* do a test to check the timeout. */ - orig_jif = rcu_lazy_get_jiffies_till_flush(); + orig_jif = rcu_get_jiffies_lazy_flush(); - rcu_lazy_set_jiffies_till_flush(2 * HZ); + rcu_set_jiffies_lazy_flush(2 * HZ); rcu_barrier(); jif_start = jiffies; @@ -775,7 +775,7 @@ kfree_scale_init(void) smp_cond_load_relaxed(&rcu_lazy_test1_cb_called, VAL == 1); - rcu_lazy_set_jiffies_till_flush(orig_jif); + rcu_set_jiffies_lazy_flush(orig_jif); if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) { pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n"); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7567ca8e743c..45d6b4c3d199 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1368,9 +1368,13 @@ rcu_torture_writer(void *arg) struct rcu_torture *rp; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); + unsigned long stallsdone = jiffies; bool stutter_waited; unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE]; + // If a new stall test is added, this must be adjusted. + if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu) + stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * HZ; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); if (!can_expedite) pr_alert("%s" TORTURE_FLAG @@ -1576,11 +1580,11 @@ rcu_torture_writer(void *arg) !atomic_read(&rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && !torture_must_stop() && - boot_ended) + boot_ended && + time_after(jiffies, stallsdone)) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && - rcu_access_pointer(rcu_torture_current) != - &rcu_tortures[i]) { + rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) { tracing_off(); show_rcu_gp_kthreads(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); @@ -2441,7 +2445,8 @@ static struct notifier_block rcu_torture_stall_block = { /* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then - * induces a CPU stall for the time specified by stall_cpu. + * induces a CPU stall for the time specified by stall_cpu. If a new + * stall test is added, stallsdone in rcu_torture_writer() must be adjusted. */ static int rcu_torture_stall(void *args) { diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0351a4e83529..e4d673fc30f4 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1234,11 +1234,20 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); /* - * The snapshot for acceleration must be taken _before_ the read of the - * current gp sequence used for advancing, otherwise advancing may fail - * and acceleration may then fail too. + * It's crucial to capture the snapshot 's' for acceleration before + * reading the current gp_seq that is used for advancing. This is + * essential because if the acceleration snapshot is taken after a + * failed advancement attempt, there's a risk that a grace period may + * conclude and a new one may start in the interim. If the snapshot is + * captured after this sequence of events, the acceleration snapshot 's' + * could be excessively advanced, leading to acceleration failure. + * In such a scenario, an 'acceleration leak' can occur, where new + * callbacks become indefinitely stuck in the RCU_NEXT_TAIL segment. + * Also note that encountering advancing failures is a normal + * occurrence when the grace period for RCU_WAIT_TAIL is in progress. * - * This could happen if: + * To see this, consider the following events which occur if + * rcu_seq_snap() were to be called after advance: * * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the * RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8). @@ -1264,6 +1273,13 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, if (rhp) { rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); + /* + * Acceleration can never fail because the base current gp_seq + * used for acceleration is <= the value of gp_seq used for + * advancing. This means that RCU_NEXT_TAIL segment will + * always be able to be emptied by the acceleration into the + * RCU_NEXT_READY_TAIL or RCU_WAIT_TAIL segments. + */ WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s)); } if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index e550f97779b8..86df878a2fee 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -24,22 +24,6 @@ void rcu_sync_init(struct rcu_sync *rsp) init_waitqueue_head(&rsp->gp_wait); } -/** - * rcu_sync_enter_start - Force readers onto slow path for multiple updates - * @rsp: Pointer to rcu_sync structure to use for synchronization - * - * Must be called after rcu_sync_init() and before first use. - * - * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() - * pairs turn into NO-OPs. - */ -void rcu_sync_enter_start(struct rcu_sync *rsp) -{ - rsp->gp_count++; - rsp->gp_state = GP_PASSED; -} - - static void rcu_sync_func(struct rcu_head *rhp); static void rcu_sync_call(struct rcu_sync *rsp) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 732ad5b39946..147b5945d67a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -32,6 +32,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @rtp_irq_work: IRQ work queue for deferred wakeups. * @barrier_q_head: RCU callback for barrier operation. * @rtp_blkd_tasks: List of tasks blocked as readers. + * @rtp_exit_list: List of tasks in the latter portion of do_exit(). * @cpu: CPU number corresponding to this entry. * @rtpp: Pointer to the rcu_tasks structure. */ @@ -46,6 +47,7 @@ struct rcu_tasks_percpu { struct irq_work rtp_irq_work; struct rcu_head barrier_q_head; struct list_head rtp_blkd_tasks; + struct list_head rtp_exit_list; int cpu; struct rcu_tasks *rtpp; }; @@ -144,8 +146,6 @@ static struct rcu_tasks rt_name = \ } #ifdef CONFIG_TASKS_RCU -/* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); /* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); @@ -240,7 +240,6 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) static void cblist_init_generic(struct rcu_tasks *rtp) { int cpu; - unsigned long flags; int lim; int shift; @@ -266,15 +265,15 @@ static void cblist_init_generic(struct rcu_tasks *rtp) WARN_ON_ONCE(!rtpcp); if (cpu) raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock)); - local_irq_save(flags); // serialize initialization if (rcu_segcblist_empty(&rtpcp->cblist)) rcu_segcblist_init(&rtpcp->cblist); - local_irq_restore(flags); INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); rtpcp->cpu = cpu; rtpcp->rtpp = rtp; if (!rtpcp->rtp_blkd_tasks.next) INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); + if (!rtpcp->rtp_exit_list.next) + INIT_LIST_HEAD(&rtpcp->rtp_exit_list); } pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name, @@ -851,10 +850,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // number of voluntary context switches, and add that task to the // holdout list. // rcu_tasks_postscan(): -// Invoke synchronize_srcu() to ensure that all tasks that were -// in the process of exiting (and which thus might not know to -// synchronize with this RCU Tasks grace period) have completed -// exiting. +// Gather per-CPU lists of tasks in do_exit() to ensure that all +// tasks that were in the process of exiting (and which thus might +// not know to synchronize with this RCU Tasks grace period) have +// completed exiting. The synchronize_rcu() in rcu_tasks_postgp() +// will take care of any tasks stuck in the non-preemptible region +// of do_exit() following its call to exit_tasks_rcu_stop(). // check_all_holdout_tasks(), repeatedly until holdout list is empty: // Scans the holdout list, attempting to identify a quiescent state // for each task on the list. If there is a quiescent state, the @@ -867,8 +868,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // with interrupts disabled. // // For each exiting task, the exit_tasks_rcu_start() and -// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU -// read-side critical sections waited for by rcu_tasks_postscan(). +// exit_tasks_rcu_finish() functions add and remove, respectively, the +// current task to a per-CPU list of tasks that rcu_tasks_postscan() must +// wait on. This is necessary because rcu_tasks_postscan() must wait on +// tasks that have already been removed from the global list of tasks. // // Pre-grace-period update-side code is ordered before the grace // via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code @@ -932,9 +935,13 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) } } +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); + /* Processing between scanning taskslist and draining the holdout list. */ static void rcu_tasks_postscan(struct list_head *hop) { + int cpu; int rtsi = READ_ONCE(rcu_task_stall_info); if (!IS_ENABLED(CONFIG_TINY_RCU)) { @@ -948,9 +955,9 @@ static void rcu_tasks_postscan(struct list_head *hop) * this, divide the fragile exit path part in two intersecting * read side critical sections: * - * 1) An _SRCU_ read side starting before calling exit_notify(), - * which may remove the task from the tasklist, and ending after - * the final preempt_disable() call in do_exit(). + * 1) A task_struct list addition before calling exit_notify(), + * which may remove the task from the tasklist, with the + * removal after the final preempt_disable() call in do_exit(). * * 2) An _RCU_ read side starting with the final preempt_disable() * call in do_exit() and ending with the final call to schedule() @@ -959,7 +966,37 @@ static void rcu_tasks_postscan(struct list_head *hop) * This handles the part 1). And postgp will handle part 2) with a * call to synchronize_rcu(). */ - synchronize_srcu(&tasks_rcu_exit_srcu); + + for_each_possible_cpu(cpu) { + unsigned long j = jiffies + 1; + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, cpu); + struct task_struct *t; + struct task_struct *t1; + struct list_head tmp; + + raw_spin_lock_irq_rcu_node(rtpcp); + list_for_each_entry_safe(t, t1, &rtpcp->rtp_exit_list, rcu_tasks_exit_list) { + if (list_empty(&t->rcu_tasks_holdout_list)) + rcu_tasks_pertask(t, hop); + + // RT kernels need frequent pauses, otherwise + // pause at least once per pair of jiffies. + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && time_before(jiffies, j)) + continue; + + // Keep our place in the list while pausing. + // Nothing else traverses this list, so adding a + // bare list_head is OK. + list_add(&tmp, &t->rcu_tasks_exit_list); + raw_spin_unlock_irq_rcu_node(rtpcp); + cond_resched(); // For CONFIG_PREEMPT=n kernels + raw_spin_lock_irq_rcu_node(rtpcp); + t1 = list_entry(tmp.next, struct task_struct, rcu_tasks_exit_list); + list_del(&tmp); + j = jiffies + 1; + } + raw_spin_unlock_irq_rcu_node(rtpcp); + } if (!IS_ENABLED(CONFIG_TINY_RCU)) del_timer_sync(&tasks_rcu_exit_srcu_stall_timer); @@ -1027,7 +1064,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp) * * In addition, this synchronize_rcu() waits for exiting tasks * to complete their final preempt_disable() region of execution, - * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu), * enforcing the whole region before tasklist removal until * the final schedule() with TASK_DEAD state to be an RCU TASKS * read side critical section. @@ -1035,9 +1071,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp) synchronize_rcu(); } -void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); -DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); - static void tasks_rcu_exit_srcu_stall(struct timer_list *unused) { #ifndef CONFIG_TINY_RCU @@ -1118,7 +1151,6 @@ module_param(rcu_tasks_lazy_ms, int, 0444); static int __init rcu_spawn_tasks_kthread(void) { - cblist_init_generic(&rcu_tasks); rcu_tasks.gp_sleep = HZ / 10; rcu_tasks.init_fract = HZ / 10; if (rcu_tasks_lazy_ms >= 0) @@ -1147,25 +1179,48 @@ struct task_struct *get_rcu_tasks_gp_kthread(void) EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread); /* - * Contribute to protect against tasklist scan blind spot while the - * task is exiting and may be removed from the tasklist. See - * corresponding synchronize_srcu() for further details. + * Protect against tasklist scan blind spot while the task is exiting and + * may be removed from the tasklist. Do this by adding the task to yet + * another list. + * + * Note that the task will remove itself from this list, so there is no + * need for get_task_struct(), except in the case where rcu_tasks_pertask() + * adds it to the holdout list, in which case rcu_tasks_pertask() supplies + * the needed get_task_struct(). */ -void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) +void exit_tasks_rcu_start(void) { - current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + struct task_struct *t = current; + + WARN_ON_ONCE(!list_empty(&t->rcu_tasks_exit_list)); + preempt_disable(); + rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu); + t->rcu_tasks_exit_cpu = smp_processor_id(); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + if (!rtpcp->rtp_exit_list.next) + INIT_LIST_HEAD(&rtpcp->rtp_exit_list); + list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + preempt_enable(); } /* - * Contribute to protect against tasklist scan blind spot while the - * task is exiting and may be removed from the tasklist. See - * corresponding synchronize_srcu() for further details. + * Remove the task from the "yet another list" because do_exit() is now + * non-preemptible, allowing synchronize_rcu() to wait beyond this point. */ -void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu) +void exit_tasks_rcu_stop(void) { + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; struct task_struct *t = current; - __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); + WARN_ON_ONCE(list_empty(&t->rcu_tasks_exit_list)); + rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, t->rcu_tasks_exit_cpu); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + list_del_init(&t->rcu_tasks_exit_list); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); } /* @@ -1282,7 +1337,6 @@ module_param(rcu_tasks_rude_lazy_ms, int, 0444); static int __init rcu_spawn_tasks_rude_kthread(void) { - cblist_init_generic(&rcu_tasks_rude); rcu_tasks_rude.gp_sleep = HZ / 10; if (rcu_tasks_rude_lazy_ms >= 0) rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms); @@ -1914,7 +1968,6 @@ module_param(rcu_tasks_trace_lazy_ms, int, 0444); static int __init rcu_spawn_tasks_trace_kthread(void) { - cblist_init_generic(&rcu_tasks_trace); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) { rcu_tasks_trace.gp_sleep = HZ / 10; rcu_tasks_trace.init_fract = HZ / 10; @@ -2086,6 +2139,24 @@ late_initcall(rcu_tasks_verify_schedule_work); static void rcu_tasks_initiate_self_tests(void) { } #endif /* #else #ifdef CONFIG_PROVE_RCU */ +void __init tasks_cblist_init_generic(void) +{ + lockdep_assert_irqs_disabled(); + WARN_ON(num_online_cpus() > 1); + +#ifdef CONFIG_TASKS_RCU + cblist_init_generic(&rcu_tasks); +#endif + +#ifdef CONFIG_TASKS_RUDE_RCU + cblist_init_generic(&rcu_tasks_rude); +#endif + +#ifdef CONFIG_TASKS_TRACE_RCU + cblist_init_generic(&rcu_tasks_trace); +#endif +} + void __init rcu_init_tasks_generic(void) { #ifdef CONFIG_TASKS_RCU diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index fec804b79080..705c0d16850a 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -261,4 +261,5 @@ void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); rcu_early_boot_tests(); + tasks_cblist_init_generic(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b2bccfd37c38..d9642dd06c25 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -145,7 +145,7 @@ static int rcu_scheduler_fully_active __read_mostly; static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags); -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); +static struct task_struct *rcu_boost_task(struct rcu_node *rnp); static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); @@ -2145,6 +2145,12 @@ static void rcu_do_batch(struct rcu_data *rdp) * Extract the list of ready callbacks, disabling IRQs to prevent * races with call_rcu() from interrupt handlers. Leave the * callback counts, as rcu_barrier() needs to be conservative. + * + * Callbacks execution is fully ordered against preceding grace period + * completion (materialized by rnp->gp_seq update) thanks to the + * smp_mb__after_unlock_lock() upon node locking required for callbacks + * advancing. In NOCB mode this ordering is then further relayed through + * the nocb locking that protects both callbacks advancing and extraction. */ rcu_nocb_lock_irqsave(rdp, flags); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); @@ -2591,12 +2597,26 @@ static int __init rcu_spawn_core_kthreads(void) return 0; } +static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func) +{ + rcu_segcblist_enqueue(&rdp->cblist, head); + if (__is_kvfree_rcu_offset((unsigned long)func)) + trace_rcu_kvfree_callback(rcu_state.name, head, + (unsigned long)func, + rcu_segcblist_n_cbs(&rdp->cblist)); + else + trace_rcu_callback(rcu_state.name, head, + rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); +} + /* * Handle any core-RCU processing required by a call_rcu() invocation. */ -static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, - unsigned long flags) +static void call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags) { + rcutree_enqueue(rdp, head, func); /* * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. @@ -2692,7 +2712,6 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) unsigned long flags; bool lazy; struct rcu_data *rdp; - bool was_alldone; /* Misaligned rcu_head! */ WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); @@ -2729,30 +2748,18 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) } check_cb_ovld(rdp); - if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) - return; // Enqueued onto ->nocb_bypass, so just leave. - // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. - rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kvfree_rcu_offset((unsigned long)func)) - trace_rcu_kvfree_callback(rcu_state.name, head, - (unsigned long)func, - rcu_segcblist_n_cbs(&rdp->cblist)); - else - trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_cbs(&rdp->cblist)); - trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); - - /* Go handle any RCU core processing required. */ - if (unlikely(rcu_rdp_is_offloaded(rdp))) { - __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ - } else { - __call_rcu_core(rdp, head, flags); - local_irq_restore(flags); - } + if (unlikely(rcu_rdp_is_offloaded(rdp))) + call_rcu_nocb(rdp, head, func, flags, lazy); + else + call_rcu_core(rdp, head, func, flags); + local_irq_restore(flags); } #ifdef CONFIG_RCU_LAZY +static bool enable_rcu_lazy __read_mostly = !IS_ENABLED(CONFIG_RCU_LAZY_DEFAULT_OFF); +module_param(enable_rcu_lazy, bool, 0444); + /** * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and * flush all lazy callbacks (including the new one) to the main ->cblist while @@ -2778,6 +2785,8 @@ void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) __call_rcu_common(head, func, false); } EXPORT_SYMBOL_GPL(call_rcu_hurry); +#else +#define enable_rcu_lazy false #endif /** @@ -2826,7 +2835,7 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY)); + __call_rcu_common(head, func, enable_rcu_lazy); } EXPORT_SYMBOL_GPL(call_rcu); @@ -4394,6 +4403,66 @@ rcu_boot_init_percpu_data(int cpu) rcu_boot_init_nocb_percpu_data(rdp); } +struct kthread_worker *rcu_exp_gp_kworker; + +static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) +{ + struct kthread_worker *kworker; + const char *name = "rcu_exp_par_gp_kthread_worker/%d"; + struct sched_param param = { .sched_priority = kthread_prio }; + int rnp_index = rnp - rcu_get_root(); + + if (rnp->exp_kworker) + return; + + kworker = kthread_create_worker(0, name, rnp_index); + if (IS_ERR_OR_NULL(kworker)) { + pr_err("Failed to create par gp kworker on %d/%d\n", + rnp->grplo, rnp->grphi); + return; + } + WRITE_ONCE(rnp->exp_kworker, kworker); + + if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD)) + sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); +} + +static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp) +{ + struct kthread_worker *kworker = READ_ONCE(rnp->exp_kworker); + + if (!kworker) + return NULL; + + return kworker->task; +} + +static void __init rcu_start_exp_gp_kworker(void) +{ + const char *name = "rcu_exp_gp_kthread_worker"; + struct sched_param param = { .sched_priority = kthread_prio }; + + rcu_exp_gp_kworker = kthread_create_worker(0, name); + if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { + pr_err("Failed to create %s!\n", name); + rcu_exp_gp_kworker = NULL; + return; + } + + if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD)) + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); +} + +static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp) +{ + if (rcu_scheduler_fully_active) { + mutex_lock(&rnp->kthread_mutex); + rcu_spawn_one_boost_kthread(rnp); + rcu_spawn_exp_par_gp_kworker(rnp); + mutex_unlock(&rnp->kthread_mutex); + } +} + /* * Invoked early in the CPU-online process, when pretty much all services * are available. The incoming CPU is not present. @@ -4442,7 +4511,7 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - rcu_spawn_one_boost_kthread(rnp); + rcu_spawn_rnp_kthreads(rnp); rcu_spawn_cpu_nocb_kthread(cpu); WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1); @@ -4450,13 +4519,64 @@ int rcutree_prepare_cpu(unsigned int cpu) } /* - * Update RCU priority boot kthread affinity for CPU-hotplug changes. + * Update kthreads affinity during CPU-hotplug changes. + * + * Set the per-rcu_node kthread's affinity to cover all CPUs that are + * served by the rcu_node in question. The CPU hotplug lock is still + * held, so the value of rnp->qsmaskinit will be stable. + * + * We don't include outgoingcpu in the affinity set, use -1 if there is + * no outgoing CPU. If there are no CPUs left in the affinity set, + * this function allows the kthread to execute on any CPU. + * + * Any future concurrent calls are serialized via ->kthread_mutex. */ -static void rcutree_affinity_setting(unsigned int cpu, int outgoing) +static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu) { - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + cpumask_var_t cm; + unsigned long mask; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct task_struct *task_boost, *task_exp; + + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + + task_boost = rcu_boost_task(rnp); + task_exp = rcu_exp_par_gp_task(rnp); + + /* + * If CPU is the boot one, those tasks are created later from early + * initcall since kthreadd must be created first. + */ + if (!task_boost && !task_exp) + return; + + if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) + return; + + mutex_lock(&rnp->kthread_mutex); + mask = rcu_rnp_online_cpus(rnp); + for_each_leaf_node_possible_cpu(rnp, cpu) + if ((mask & leaf_node_cpu_bit(rnp, cpu)) && + cpu != outgoingcpu) + cpumask_set_cpu(cpu, cm); + cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); + if (cpumask_empty(cm)) { + cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); + if (outgoingcpu >= 0) + cpumask_clear_cpu(outgoingcpu, cm); + } + + if (task_exp) + set_cpus_allowed_ptr(task_exp, cm); + + if (task_boost) + set_cpus_allowed_ptr(task_boost, cm); - rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); + mutex_unlock(&rnp->kthread_mutex); + + free_cpumask_var(cm); } /* @@ -4640,8 +4760,9 @@ void rcutree_migrate_callbacks(int cpu) __call_rcu_nocb_wake(my_rdp, true, flags); } else { rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */ - raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags); + raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ } + local_irq_restore(flags); if (needwake) rcu_gp_kthread_wake(); lockdep_assert_irqs_enabled(); @@ -4730,51 +4851,6 @@ static int rcu_pm_notify(struct notifier_block *self, return NOTIFY_OK; } -#ifdef CONFIG_RCU_EXP_KTHREAD -struct kthread_worker *rcu_exp_gp_kworker; -struct kthread_worker *rcu_exp_par_gp_kworker; - -static void __init rcu_start_exp_gp_kworkers(void) -{ - const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker"; - const char *gp_kworker_name = "rcu_exp_gp_kthread_worker"; - struct sched_param param = { .sched_priority = kthread_prio }; - - rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); - if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { - pr_err("Failed to create %s!\n", gp_kworker_name); - return; - } - - rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); - if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { - pr_err("Failed to create %s!\n", par_gp_kworker_name); - kthread_destroy_worker(rcu_exp_gp_kworker); - return; - } - - sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); - sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, - ¶m); -} - -static inline void rcu_alloc_par_gp_wq(void) -{ -} -#else /* !CONFIG_RCU_EXP_KTHREAD */ -struct workqueue_struct *rcu_par_gp_wq; - -static void __init rcu_start_exp_gp_kworkers(void) -{ -} - -static inline void rcu_alloc_par_gp_wq(void) -{ - rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); - WARN_ON(!rcu_par_gp_wq); -} -#endif /* CONFIG_RCU_EXP_KTHREAD */ - /* * Spawn the kthreads that handle RCU's grace periods. */ @@ -4809,10 +4885,10 @@ static int __init rcu_spawn_gp_kthread(void) * due to rcu_scheduler_fully_active. */ rcu_spawn_cpu_nocb_kthread(smp_processor_id()); - rcu_spawn_one_boost_kthread(rdp->mynode); + rcu_spawn_rnp_kthreads(rdp->mynode); rcu_spawn_core_kthreads(); /* Create kthread worker for expedited GPs */ - rcu_start_exp_gp_kworkers(); + rcu_start_exp_gp_kworker(); return 0; } early_initcall(rcu_spawn_gp_kthread); @@ -4915,7 +4991,7 @@ static void __init rcu_init_one(void) init_waitqueue_head(&rnp->exp_wq[2]); init_waitqueue_head(&rnp->exp_wq[3]); spin_lock_init(&rnp->exp_lock); - mutex_init(&rnp->boost_kthread_mutex); + mutex_init(&rnp->kthread_mutex); raw_spin_lock_init(&rnp->exp_poll_lock); rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp); @@ -5152,7 +5228,6 @@ void __init rcu_init(void) /* Create workqueue for Tree SRCU and for expedited GPs. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); - rcu_alloc_par_gp_wq(); /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ @@ -5165,6 +5240,8 @@ void __init rcu_init(void) (void)start_poll_synchronize_rcu_expedited(); rcu_test_sync_prims(); + + tasks_cblist_init_generic(); } #include "tree_stall.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e9821a8422db..df48160b3136 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -21,14 +21,10 @@ #include "rcu_segcblist.h" -/* Communicate arguments to a workqueue handler. */ +/* Communicate arguments to a kthread worker handler. */ struct rcu_exp_work { unsigned long rew_s; -#ifdef CONFIG_RCU_EXP_KTHREAD struct kthread_work rew_work; -#else - struct work_struct rew_work; -#endif /* CONFIG_RCU_EXP_KTHREAD */ }; /* RCU's kthread states for tracing. */ @@ -72,6 +68,9 @@ struct rcu_node { /* Online CPUs for next expedited GP. */ /* Any CPU that has ever been online will */ /* have its bit set. */ + struct kthread_worker *exp_kworker; + /* Workers performing per node expedited GP */ + /* initialization. */ unsigned long cbovldmask; /* CPUs experiencing callback overload. */ unsigned long ffmask; /* Fully functional CPUs. */ @@ -113,7 +112,7 @@ struct rcu_node { /* side effect, not as a lock. */ unsigned long boost_time; /* When to start boosting (jiffies). */ - struct mutex boost_kthread_mutex; + struct mutex kthread_mutex; /* Exclusion for thread spawning and affinity */ /* manipulation. */ struct task_struct *boost_kthread_task; @@ -467,11 +466,10 @@ static void rcu_init_one_nocb(struct rcu_node *rnp); static bool wake_nocb_gp(struct rcu_data *rdp, bool force); static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, unsigned long j, bool lazy); -static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags, - bool lazy); -static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, - unsigned long flags); +static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags, bool lazy); +static void __maybe_unused __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, + unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level); static bool do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 2ac440bc7e10..6b83537480b1 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -198,10 +198,9 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, } if (rnp->parent == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). */ + if (wake) swake_up_one_online(&rcu_state.expedited_wq); - } + break; } mask = rnp->grpmask; @@ -419,7 +418,6 @@ retry_ipi: static void rcu_exp_sel_wait_wake(unsigned long s); -#ifdef CONFIG_RCU_EXP_KTHREAD static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) { struct rcu_exp_work *rewp = @@ -428,9 +426,14 @@ static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) __sync_rcu_exp_select_node_cpus(rewp); } -static inline bool rcu_gp_par_worker_started(void) +static inline bool rcu_exp_worker_started(void) +{ + return !!READ_ONCE(rcu_exp_gp_kworker); +} + +static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp) { - return !!READ_ONCE(rcu_exp_par_gp_kworker); + return !!READ_ONCE(rnp->exp_kworker); } static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) @@ -441,7 +444,7 @@ static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) * another work item on the same kthread worker can result in * deadlock. */ - kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work); + kthread_queue_work(READ_ONCE(rnp->exp_kworker), &rnp->rew.rew_work); } static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) @@ -466,64 +469,6 @@ static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work); } -static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) -{ -} -#else /* !CONFIG_RCU_EXP_KTHREAD */ -static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) -{ - struct rcu_exp_work *rewp = - container_of(wp, struct rcu_exp_work, rew_work); - - __sync_rcu_exp_select_node_cpus(rewp); -} - -static inline bool rcu_gp_par_worker_started(void) -{ - return !!READ_ONCE(rcu_par_gp_wq); -} - -static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) -{ - int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); - - INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - /* If all offline, queue the work on an unbound CPU. */ - if (unlikely(cpu > rnp->grphi - rnp->grplo)) - cpu = WORK_CPU_UNBOUND; - else - cpu += rnp->grplo; - queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); -} - -static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) -{ - flush_work(&rnp->rew.rew_work); -} - -/* - * Work-queue handler to drive an expedited grace period forward. - */ -static void wait_rcu_exp_gp(struct work_struct *wp) -{ - struct rcu_exp_work *rewp; - - rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_s); -} - -static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) -{ - INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp); - queue_work(rcu_gp_wq, &rew->rew_work); -} - -static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) -{ - destroy_work_on_stack(&rew->rew_work); -} -#endif /* CONFIG_RCU_EXP_KTHREAD */ - /* * Select the nodes that the upcoming expedited grace period needs * to wait for. @@ -541,7 +486,7 @@ static void sync_rcu_exp_select_cpus(void) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - if (!rcu_gp_par_worker_started() || + if (!rcu_exp_par_worker_started(rnp) || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { /* No worker started yet or last leaf, do direct call. */ @@ -956,7 +901,6 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { - bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT); unsigned long flags; struct rcu_exp_work rew; struct rcu_node *rnp; @@ -996,7 +940,7 @@ void synchronize_rcu_expedited(void) return; /* Someone else did our work for us. */ /* Ensure that load happens before action based on it. */ - if (unlikely(boottime)) { + if (unlikely((rcu_scheduler_active == RCU_SCHEDULER_INIT) || !rcu_exp_worker_started())) { /* Direct call during scheduler init and early_initcalls(). */ rcu_exp_sel_wait_wake(s); } else { @@ -1013,9 +957,6 @@ void synchronize_rcu_expedited(void) /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); - - if (likely(!boottime)) - synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 4efbf7333d4e..3f85577bddd4 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -256,6 +256,7 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force) return __wake_nocb_gp(rdp_gp, rdp, force, flags); } +#ifdef CONFIG_RCU_LAZY /* * LAZY_FLUSH_JIFFIES decides the maximum amount of time that * can elapse before lazy callbacks are flushed. Lazy callbacks @@ -264,21 +265,20 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force) * left unsubmitted to RCU after those many jiffies. */ #define LAZY_FLUSH_JIFFIES (10 * HZ) -static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES; +static unsigned long jiffies_lazy_flush = LAZY_FLUSH_JIFFIES; -#ifdef CONFIG_RCU_LAZY // To be called only from test code. -void rcu_lazy_set_jiffies_till_flush(unsigned long jif) +void rcu_set_jiffies_lazy_flush(unsigned long jif) { - jiffies_till_flush = jif; + jiffies_lazy_flush = jif; } -EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush); +EXPORT_SYMBOL(rcu_set_jiffies_lazy_flush); -unsigned long rcu_lazy_get_jiffies_till_flush(void) +unsigned long rcu_get_jiffies_lazy_flush(void) { - return jiffies_till_flush; + return jiffies_lazy_flush; } -EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush); +EXPORT_SYMBOL(rcu_get_jiffies_lazy_flush); #endif /* @@ -299,7 +299,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, */ if (waketype == RCU_NOCB_WAKE_LAZY && rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) { - mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush); + mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush()); WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); } else if (waketype == RCU_NOCB_WAKE_BYPASS) { mod_timer(&rdp_gp->nocb_timer, jiffies + 2); @@ -482,7 +482,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, // flush ->nocb_bypass to ->cblist. if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) || (ncbs && bypass_is_lazy && - (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) || + (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()))) || ncbs >= qhimark) { rcu_nocb_lock(rdp); *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); @@ -532,9 +532,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, // 2. Both of these conditions are met: // a. The bypass list previously had only lazy CBs, and: // b. The new CB is non-lazy. - if (ncbs && (!bypass_is_lazy || lazy)) { - local_irq_restore(flags); - } else { + if (!ncbs || (bypass_is_lazy && !lazy)) { // No-CBs GP kthread might be indefinitely asleep, if so, wake. rcu_nocb_lock(rdp); // Rare during call_rcu() flood. if (!rcu_segcblist_pend_cbs(&rdp->cblist)) { @@ -544,7 +542,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, } else { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQnoWake")); - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); } } return true; // Callback already enqueued. @@ -566,11 +564,12 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, long lazy_len; long len; struct task_struct *t; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; // If we are being polled or there is no kthread, just leave. t = READ_ONCE(rdp->nocb_gp_kthread); if (rcu_nocb_poll || !t) { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNotPoll")); return; @@ -583,17 +582,17 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, rdp->qlen_last_fqs_check = len; // Only lazy CBs in bypass list if (lazy_len && bypass_len == lazy_len) { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY, TPS("WakeLazy")); } else if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); wake_nocb_gp(rdp, false); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, TPS("WakeEmptyIsDeferred")); } @@ -610,20 +609,32 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, smp_mb(); /* Enqueue before timer_pending(). */ if ((rdp->nocb_cb_sleep || !rcu_segcblist_ready_cbs(&rdp->cblist)) && - !timer_pending(&rdp->nocb_timer)) { - rcu_nocb_unlock_irqrestore(rdp, flags); + !timer_pending(&rdp_gp->nocb_timer)) { + rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); } else { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); } } else { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); } } +static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags, bool lazy) +{ + bool was_alldone; + + if (!rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) { + /* Not enqueued on bypass but locked, do regular enqueue */ + rcutree_enqueue(rdp, head, func); + __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ + } +} + static int nocb_gp_toggle_rdp(struct rcu_data *rdp, bool *wake_state) { @@ -723,7 +734,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) lazy_ncbs = READ_ONCE(rdp->lazy_len); if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) && - (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) || + (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()) || bypass_ncbs > 2 * qhimark)) { flush_bypass = true; } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) && @@ -779,7 +790,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) if (rcu_segcblist_ready_cbs(&rdp->cblist)) { needwake = rdp->nocb_cb_sleep; WRITE_ONCE(rdp->nocb_cb_sleep, false); - smp_mb(); /* CB invocation -after- GP end. */ } else { needwake = false; } @@ -933,8 +943,7 @@ static void nocb_cb_wait(struct rcu_data *rdp) swait_event_interruptible_exclusive(rdp->nocb_cb_wq, nocb_cb_wait_cond(rdp)); - // VVV Ensure CB invocation follows _sleep test. - if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ + if (READ_ONCE(rdp->nocb_cb_sleep)) { WARN_ON(signal_pending(current)); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); } @@ -1383,7 +1392,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) rcu_nocb_unlock_irqrestore(rdp, flags); continue; } - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); + rcu_nocb_try_flush_bypass(rdp, jiffies); rcu_nocb_unlock_irqrestore(rdp, flags); wake_nocb_gp(rdp, false); sc->nr_to_scan -= _count; @@ -1768,10 +1777,10 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, return true; } -static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags, bool lazy) +static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags, bool lazy) { - return false; + WARN_ON_ONCE(1); /* Should be dead code! */ } static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 41021080ad25..36a8b5dbf5b5 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1195,14 +1195,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) struct sched_param sp; struct task_struct *t; - mutex_lock(&rnp->boost_kthread_mutex); - if (rnp->boost_kthread_task || !rcu_scheduler_fully_active) - goto out; + if (rnp->boost_kthread_task) + return; t = kthread_create(rcu_boost_kthread, (void *)rnp, "rcub/%d", rnp_index); if (WARN_ON_ONCE(IS_ERR(t))) - goto out; + return; raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; @@ -1210,48 +1209,11 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ - - out: - mutex_unlock(&rnp->boost_kthread_mutex); } -/* - * Set the per-rcu_node kthread's affinity to cover all CPUs that are - * served by the rcu_node in question. The CPU hotplug lock is still - * held, so the value of rnp->qsmaskinit will be stable. - * - * We don't include outgoingcpu in the affinity set, use -1 if there is - * no outgoing CPU. If there are no CPUs left in the affinity set, - * this function allows the kthread to execute on any CPU. - * - * Any future concurrent calls are serialized via ->boost_kthread_mutex. - */ -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +static struct task_struct *rcu_boost_task(struct rcu_node *rnp) { - struct task_struct *t = rnp->boost_kthread_task; - unsigned long mask; - cpumask_var_t cm; - int cpu; - - if (!t) - return; - if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) - return; - mutex_lock(&rnp->boost_kthread_mutex); - mask = rcu_rnp_online_cpus(rnp); - for_each_leaf_node_possible_cpu(rnp, cpu) - if ((mask & leaf_node_cpu_bit(rnp, cpu)) && - cpu != outgoingcpu) - cpumask_set_cpu(cpu, cm); - cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (cpumask_empty(cm)) { - cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (outgoingcpu >= 0) - cpumask_clear_cpu(outgoingcpu, cm); - } - set_cpus_allowed_ptr(t, cm); - mutex_unlock(&rnp->boost_kthread_mutex); - free_cpumask_var(cm); + return READ_ONCE(rnp->boost_kthread_task); } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1270,10 +1232,10 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { } -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +static struct task_struct *rcu_boost_task(struct rcu_node *rnp) { + return NULL; } - #endif /* #else #ifdef CONFIG_RCU_BOOST */ /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9116bcc90346..540f229700b6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3955,6 +3955,17 @@ void wake_up_if_idle(int cpu) } } +bool cpus_equal_capacity(int this_cpu, int that_cpu) +{ + if (!sched_asym_cpucap_active()) + return true; + + if (this_cpu == that_cpu) + return true; + + return arch_scale_cpu_capacity(this_cpu) == arch_scale_cpu_capacity(that_cpu); +} + bool cpus_share_cache(int this_cpu, int that_cpu) { if (this_cpu == that_cpu) @@ -6787,10 +6798,12 @@ static inline void sched_submit_work(struct task_struct *tsk) static void sched_update_worker(struct task_struct *tsk) { - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) { + if (tsk->flags & PF_BLOCK_TS) + blk_plug_invalidate_ts(tsk); if (tsk->flags & PF_WQ_WORKER) wq_worker_running(tsk); - else + else if (tsk->flags & PF_IO_WORKER) io_wq_worker_running(tsk); } } diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 2ad881d07752..4e715b9b278e 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -162,6 +162,9 @@ | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \ | MEMBARRIER_CMD_GET_REGISTRATIONS) +static DEFINE_MUTEX(membarrier_ipi_mutex); +#define SERIALIZE_IPI() guard(mutex)(&membarrier_ipi_mutex) + static void ipi_mb(void *info) { smp_mb(); /* IPIs should be serializing but paranoid. */ @@ -259,6 +262,7 @@ static int membarrier_global_expedited(void) if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) return -ENOMEM; + SERIALIZE_IPI(); cpus_read_lock(); rcu_read_lock(); for_each_online_cpu(cpu) { @@ -347,6 +351,7 @@ static int membarrier_private_expedited(int flags, int cpu_id) if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) return -ENOMEM; + SERIALIZE_IPI(); cpus_read_lock(); if (cpu_id >= 0) { @@ -460,6 +465,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm) * between threads which are users of @mm has its membarrier state * updated. */ + SERIALIZE_IPI(); cpus_read_lock(); rcu_read_lock(); for_each_online_cpu(cpu) { diff --git a/kernel/signal.c b/kernel/signal.c index c9c57d053ce4..bdca529f0f7b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -47,6 +47,7 @@ #include <linux/cgroup.h> #include <linux/audit.h> #include <linux/sysctl.h> +#include <uapi/linux/pidfd.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -1436,7 +1437,8 @@ void lockdep_assert_task_sighand_held(struct task_struct *task) #endif /* - * send signal info to all the members of a group + * send signal info to all the members of a thread group or to the + * individual thread if type == PIDTYPE_PID. */ int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) @@ -1478,7 +1480,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) return ret; } -int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) +static int kill_pid_info_type(int sig, struct kernel_siginfo *info, + struct pid *pid, enum pid_type type) { int error = -ESRCH; struct task_struct *p; @@ -1487,11 +1490,10 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) - error = group_send_sig_info(sig, info, p, PIDTYPE_TGID); + error = group_send_sig_info(sig, info, p, type); rcu_read_unlock(); if (likely(!p || error != -ESRCH)) return error; - /* * The task was unhashed in between, try again. If it * is dead, pid_task() will return NULL, if we race with @@ -1500,6 +1502,11 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) } } +int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) +{ + return kill_pid_info_type(sig, info, pid, PIDTYPE_TGID); +} + static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid) { int error; @@ -1898,16 +1905,19 @@ int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, return send_sig_info(info.si_signo, &info, t); } -int kill_pgrp(struct pid *pid, int sig, int priv) +static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { int ret; - read_lock(&tasklist_lock); - ret = __kill_pgrp_info(sig, __si_special(priv), pid); + ret = __kill_pgrp_info(sig, info, pgrp); read_unlock(&tasklist_lock); - return ret; } + +int kill_pgrp(struct pid *pid, int sig, int priv) +{ + return kill_pgrp_info(sig, __si_special(priv), pid); +} EXPORT_SYMBOL(kill_pgrp); int kill_pid(struct pid *pid, int sig, int priv) @@ -2019,13 +2029,14 @@ ret: return ret; } -static void do_notify_pidfd(struct task_struct *task) +void do_notify_pidfd(struct task_struct *task) { - struct pid *pid; + struct pid *pid = task_pid(task); WARN_ON(task->exit_state == 0); - pid = task_pid(task); - wake_up_all(&pid->wait_pidfd); + + __wake_up(&pid->wait_pidfd, TASK_NORMAL, 0, + poll_to_key(EPOLLIN | EPOLLRDNORM)); } /* @@ -2050,9 +2061,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig) WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); - - /* Wake up all pidfd waiters */ - do_notify_pidfd(tsk); + /* + * tsk is a group leader and has no threads, wake up the + * non-PIDFD_THREAD waiters. + */ + if (thread_group_empty(tsk)) + do_notify_pidfd(tsk); if (sig != SIGCHLD) { /* @@ -3789,12 +3803,13 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese, #endif #endif -static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info) +static void prepare_kill_siginfo(int sig, struct kernel_siginfo *info, + enum pid_type type) { clear_siginfo(info); info->si_signo = sig; info->si_errno = 0; - info->si_code = SI_USER; + info->si_code = (type == PIDTYPE_PID) ? SI_TKILL : SI_USER; info->si_pid = task_tgid_vnr(current); info->si_uid = from_kuid_munged(current_user_ns(), current_uid()); } @@ -3808,7 +3823,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct kernel_siginfo info; - prepare_kill_siginfo(sig, &info); + prepare_kill_siginfo(sig, &info, PIDTYPE_TGID); return kill_something_info(sig, &info, pid); } @@ -3861,6 +3876,10 @@ static struct pid *pidfd_to_pid(const struct file *file) return tgid_pidfd_to_pid(file); } +#define PIDFD_SEND_SIGNAL_FLAGS \ + (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \ + PIDFD_SIGNAL_PROCESS_GROUP) + /** * sys_pidfd_send_signal - Signal a process through a pidfd * @pidfd: file descriptor of the process @@ -3868,14 +3887,10 @@ static struct pid *pidfd_to_pid(const struct file *file) * @info: signal info * @flags: future flags * - * The syscall currently only signals via PIDTYPE_PID which covers - * kill(<positive-pid>, <signal>. It does not signal threads or process - * groups. - * In order to extend the syscall to threads and process groups the @flags - * argument should be used. In essence, the @flags argument will determine - * what is signaled and not the file descriptor itself. Put in other words, - * grouping is a property of the flags argument not a property of the file - * descriptor. + * Send the signal to the thread group or to the individual thread depending + * on PIDFD_THREAD. + * In the future extension to @flags may be used to override the default scope + * of @pidfd. * * Return: 0 on success, negative errno on failure */ @@ -3886,9 +3901,14 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, struct fd f; struct pid *pid; kernel_siginfo_t kinfo; + enum pid_type type; /* Enforce flags be set to 0 until we add an extension. */ - if (flags) + if (flags & ~PIDFD_SEND_SIGNAL_FLAGS) + return -EINVAL; + + /* Ensure that only a single signal scope determining flag is set. */ + if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1) return -EINVAL; f = fdget(pidfd); @@ -3906,6 +3926,25 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, if (!access_pidfd_pidns(pid)) goto err; + switch (flags) { + case 0: + /* Infer scope from the type of pidfd. */ + if (f.file->f_flags & PIDFD_THREAD) + type = PIDTYPE_PID; + else + type = PIDTYPE_TGID; + break; + case PIDFD_SIGNAL_THREAD: + type = PIDTYPE_PID; + break; + case PIDFD_SIGNAL_THREAD_GROUP: + type = PIDTYPE_TGID; + break; + case PIDFD_SIGNAL_PROCESS_GROUP: + type = PIDTYPE_PGID; + break; + } + if (info) { ret = copy_siginfo_from_user_any(&kinfo, info); if (unlikely(ret)) @@ -3917,15 +3956,17 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, /* Only allow sending arbitrary signals to yourself. */ ret = -EPERM; - if ((task_pid(current) != pid) && + if ((task_pid(current) != pid || type > PIDTYPE_TGID) && (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) goto err; } else { - prepare_kill_siginfo(sig, &kinfo); + prepare_kill_siginfo(sig, &kinfo, type); } - ret = kill_pid_info(sig, &kinfo, pid); - + if (type == PIDTYPE_PGID) + ret = kill_pgrp_info(sig, &kinfo, pid); + else + ret = kill_pid_info_type(sig, &kinfo, pid, type); err: fdput(f); return ret; @@ -3965,12 +4006,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) { struct kernel_siginfo info; - clear_siginfo(&info); - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_TKILL; - info.si_pid = task_tgid_vnr(current); - info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); + prepare_kill_siginfo(sig, &info, PIDTYPE_PID); return do_send_specific(tgid, pid, sig, &info); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 210cf5f8d92c..b315b21fb28c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -27,6 +27,7 @@ #include <linux/tick.h> #include <linux/irq.h> #include <linux/wait_bit.h> +#include <linux/workqueue.h> #include <asm/softirq_stack.h> @@ -802,11 +803,13 @@ static void tasklet_action_common(struct softirq_action *a, static __latent_entropy void tasklet_action(struct softirq_action *a) { + workqueue_softirq_action(false); tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); } static __latent_entropy void tasklet_hi_action(struct softirq_action *a) { + workqueue_softirq_action(true); tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); } @@ -929,6 +932,8 @@ static void run_ksoftirqd(unsigned int cpu) #ifdef CONFIG_HOTPLUG_CPU static int takeover_tasklets(unsigned int cpu) { + workqueue_softirq_dead(cpu); + /* CPU is dead, so no lock needed. */ local_irq_disable(); diff --git a/kernel/sys.c b/kernel/sys.c index e219fcfa112d..f8e543f1e38a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1785,21 +1785,24 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; u64 tgutime, tgstime, utime, stime; - unsigned long maxrss = 0; + unsigned long maxrss; + struct mm_struct *mm; struct signal_struct *sig = p->signal; + unsigned int seq = 0; - memset((char *)r, 0, sizeof (*r)); +retry: + memset(r, 0, sizeof(*r)); utime = stime = 0; + maxrss = 0; if (who == RUSAGE_THREAD) { task_cputime_adjusted(current, &utime, &stime); accumulate_thread_rusage(p, r); maxrss = sig->maxrss; - goto out; + goto out_thread; } - if (!lock_task_sighand(p, &flags)) - return; + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); switch (who) { case RUSAGE_BOTH: @@ -1819,9 +1822,6 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) fallthrough; case RUSAGE_SELF: - thread_group_cputime_adjusted(p, &tgutime, &tgstime); - utime += tgutime; - stime += tgstime; r->ru_nvcsw += sig->nvcsw; r->ru_nivcsw += sig->nivcsw; r->ru_minflt += sig->min_flt; @@ -1830,28 +1830,42 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_oublock += sig->oublock; if (maxrss < sig->maxrss) maxrss = sig->maxrss; + + rcu_read_lock(); __for_each_thread(sig, t) accumulate_thread_rusage(t, r); + rcu_read_unlock(); + break; default: BUG(); } - unlock_task_sighand(p, &flags); -out: - r->ru_utime = ns_to_kernel_old_timeval(utime); - r->ru_stime = ns_to_kernel_old_timeval(stime); + if (need_seqretry(&sig->stats_lock, seq)) { + seq = 1; + goto retry; + } + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); - if (who != RUSAGE_CHILDREN) { - struct mm_struct *mm = get_task_mm(p); + if (who == RUSAGE_CHILDREN) + goto out_children; - if (mm) { - setmax_mm_hiwater_rss(&maxrss, mm); - mmput(mm); - } + thread_group_cputime_adjusted(p, &tgutime, &tgstime); + utime += tgutime; + stime += tgstime; + +out_thread: + mm = get_task_mm(p); + if (mm) { + setmax_mm_hiwater_rss(&maxrss, mm); + mmput(mm); } + +out_children: r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ + r->ru_utime = ns_to_kernel_old_timeval(utime); + r->ru_stime = ns_to_kernel_old_timeval(stime); } SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 760793998cdd..edb0f821dcea 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1085,6 +1085,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, enum hrtimer_mode mode) { debug_activate(timer, mode); + WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; @@ -2183,6 +2184,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->online = 1; hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } @@ -2250,6 +2252,7 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); + old_base->online = 0; raw_spin_unlock(&old_base->lock); return 0; diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c index ca058c8af6ba..3e5d422dd15c 100644 --- a/kernel/time/time_test.c +++ b/kernel/time/time_test.c @@ -73,7 +73,7 @@ static void time64_to_tm_test_date_range(struct kunit *test) days = div_s64(secs, 86400); - #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %ld", \ + #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %lld", \ year, month, mdday, yday, days KUNIT_ASSERT_EQ_MSG(test, year - 1900, result.tm_year, FAIL_MSG); diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 6cd2a4e3afb8..9ff018245840 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -189,9 +189,6 @@ static int fprobe_init_rethook(struct fprobe *fp, int num) { int size; - if (num <= 0) - return -EINVAL; - if (!fp->exit_handler) { fp->rethook = NULL; return 0; @@ -199,15 +196,16 @@ static int fprobe_init_rethook(struct fprobe *fp, int num) /* Initialize rethook if needed */ if (fp->nr_maxactive) - size = fp->nr_maxactive; + num = fp->nr_maxactive; else - size = num * num_possible_cpus() * 2; - if (size <= 0) + num *= num_possible_cpus() * 2; + if (num <= 0) return -EINVAL; + size = sizeof(struct fprobe_rethook_node) + fp->entry_data_size; + /* Initialize rethook */ - fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, - sizeof(struct fprobe_rethook_node), size); + fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, size, num); if (IS_ERR(fp->rethook)) return PTR_ERR(fp->rethook); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b01ae7d36021..83ba342aef31 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5325,7 +5325,17 @@ static LIST_HEAD(ftrace_direct_funcs); static int register_ftrace_function_nolock(struct ftrace_ops *ops); +/* + * If there are multiple ftrace_ops, use SAVE_REGS by default, so that direct + * call will be jumped from ftrace_regs_caller. Only if the architecture does + * not support ftrace_regs_caller but direct_call, use SAVE_ARGS so that it + * jumps from ftrace_caller for multiple ftrace_ops. + */ +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS #define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_ARGS) +#else +#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS) +#endif static int check_direct_multi(struct ftrace_ops *ops) { diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fd4bfe3ecf01..aa332ace108b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -384,7 +384,6 @@ struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; wait_queue_head_t full_waiters; - long wait_index; bool waiters_pending; bool full_waiters_pending; bool wakeup_full; @@ -756,8 +755,19 @@ static void rb_wake_up_waiters(struct irq_work *work) wake_up_all(&rbwork->waiters); if (rbwork->full_waiters_pending || rbwork->wakeup_full) { + /* Only cpu_buffer sets the above flags */ + struct ring_buffer_per_cpu *cpu_buffer = + container_of(rbwork, struct ring_buffer_per_cpu, irq_work); + + /* Called from interrupt context */ + raw_spin_lock(&cpu_buffer->reader_lock); rbwork->wakeup_full = false; rbwork->full_waiters_pending = false; + + /* Waking up all waiters, they will reset the shortest full */ + cpu_buffer->shortest_full = 0; + raw_spin_unlock(&cpu_buffer->reader_lock); + wake_up_all(&rbwork->full_waiters); } } @@ -798,14 +808,40 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu) rbwork = &cpu_buffer->irq_work; } - rbwork->wait_index++; - /* make sure the waiters see the new index */ - smp_wmb(); - /* This can be called in any context */ irq_work_queue(&rbwork->work); } +static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer; + bool ret = false; + + /* Reads of all CPUs always waits for any data */ + if (cpu == RING_BUFFER_ALL_CPUS) + return !ring_buffer_empty(buffer); + + cpu_buffer = buffer->buffers[cpu]; + + if (!ring_buffer_empty_cpu(buffer, cpu)) { + unsigned long flags; + bool pagebusy; + + if (!full) + return true; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + ret = !pagebusy && full_hit(buffer, cpu, full); + + if (!cpu_buffer->shortest_full || + cpu_buffer->shortest_full > full) + cpu_buffer->shortest_full = full; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + } + return ret; +} + /** * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on @@ -821,7 +857,6 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) struct ring_buffer_per_cpu *cpu_buffer; DEFINE_WAIT(wait); struct rb_irq_work *work; - long wait_index; int ret = 0; /* @@ -840,81 +875,54 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) work = &cpu_buffer->irq_work; } - wait_index = READ_ONCE(work->wait_index); - - while (true) { - if (full) - prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE); - else - prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); - - /* - * The events can happen in critical sections where - * checking a work queue can cause deadlocks. - * After adding a task to the queue, this flag is set - * only to notify events to try to wake up the queue - * using irq_work. - * - * We don't clear it even if the buffer is no longer - * empty. The flag only causes the next event to run - * irq_work to do the work queue wake up. The worse - * that can happen if we race with !trace_empty() is that - * an event will cause an irq_work to try to wake up - * an empty queue. - * - * There's no reason to protect this flag either, as - * the work queue and irq_work logic will do the necessary - * synchronization for the wake ups. The only thing - * that is necessary is that the wake up happens after - * a task has been queued. It's OK for spurious wake ups. - */ - if (full) - work->full_waiters_pending = true; - else - work->waiters_pending = true; - - if (signal_pending(current)) { - ret = -EINTR; - break; - } - - if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) - break; - - if (cpu != RING_BUFFER_ALL_CPUS && - !ring_buffer_empty_cpu(buffer, cpu)) { - unsigned long flags; - bool pagebusy; - bool done; - - if (!full) - break; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - done = !pagebusy && full_hit(buffer, cpu, full); + if (full) + prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE); + else + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); - if (!cpu_buffer->shortest_full || - cpu_buffer->shortest_full > full) - cpu_buffer->shortest_full = full; - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (done) - break; - } + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. + * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. + */ + if (full) + work->full_waiters_pending = true; + else + work->waiters_pending = true; - schedule(); + if (rb_watermark_hit(buffer, cpu, full)) + goto out; - /* Make sure to see the new wait index */ - smp_rmb(); - if (wait_index != work->wait_index) - break; + if (signal_pending(current)) { + ret = -EINTR; + goto out; } + schedule(); + out: if (full) finish_wait(&work->full_waiters, &wait); else finish_wait(&work->waiters, &wait); + if (!ret && !rb_watermark_hit(buffer, cpu, full) && signal_pending(current)) + ret = -EINTR; + return ret; } @@ -937,28 +945,33 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table, int full) { struct ring_buffer_per_cpu *cpu_buffer; - struct rb_irq_work *work; + struct rb_irq_work *rbwork; if (cpu == RING_BUFFER_ALL_CPUS) { - work = &buffer->irq_work; + rbwork = &buffer->irq_work; full = 0; } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return EPOLLERR; cpu_buffer = buffer->buffers[cpu]; - work = &cpu_buffer->irq_work; + rbwork = &cpu_buffer->irq_work; } if (full) { - poll_wait(filp, &work->full_waiters, poll_table); - work->full_waiters_pending = true; + unsigned long flags; + + poll_wait(filp, &rbwork->full_waiters, poll_table); + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rbwork->full_waiters_pending = true; if (!cpu_buffer->shortest_full || cpu_buffer->shortest_full > full) cpu_buffer->shortest_full = full; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } else { - poll_wait(filp, &work->waiters, poll_table); - work->waiters_pending = true; + poll_wait(filp, &rbwork->waiters, poll_table); + rbwork->waiters_pending = true; } /* @@ -5877,6 +5890,10 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) if (psize <= BUF_PAGE_HDR_SIZE) return -EINVAL; + /* Size of a subbuf cannot be greater than the write counter */ + if (psize > RB_WRITE_MASK + 1) + return -EINVAL; + old_order = buffer->subbuf_order; old_size = buffer->subbuf_size; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a7c6fd934e9..c9c898307348 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -39,6 +39,7 @@ #include <linux/ctype.h> #include <linux/init.h> #include <linux/panic_notifier.h> +#include <linux/kmemleak.h> #include <linux/poll.h> #include <linux/nmi.h> #include <linux/fs.h> @@ -1532,7 +1533,7 @@ void disable_trace_on_warning(void) bool tracer_tracing_is_on(struct trace_array *tr) { if (tr->array_buffer.buffer) - return ring_buffer_record_is_on(tr->array_buffer.buffer); + return ring_buffer_record_is_set_on(tr->array_buffer.buffer); return !tr->buffer_disabled; } @@ -2320,7 +2321,7 @@ struct saved_cmdlines_buffer { unsigned *map_cmdline_to_pid; unsigned cmdline_num; int cmdline_idx; - char *saved_cmdlines; + char saved_cmdlines[]; }; static struct saved_cmdlines_buffer *savedcmd; @@ -2334,47 +2335,60 @@ static inline void set_cmdline(int idx, const char *cmdline) strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); } -static int allocate_cmdlines_buffer(unsigned int val, - struct saved_cmdlines_buffer *s) +static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) +{ + int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN); + + kfree(s->map_cmdline_to_pid); + kmemleak_free(s); + free_pages((unsigned long)s, order); +} + +static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) { + struct saved_cmdlines_buffer *s; + struct page *page; + int orig_size, size; + int order; + + /* Figure out how much is needed to hold the given number of cmdlines */ + orig_size = sizeof(*s) + val * TASK_COMM_LEN; + order = get_order(orig_size); + size = 1 << (order + PAGE_SHIFT); + page = alloc_pages(GFP_KERNEL, order); + if (!page) + return NULL; + + s = page_address(page); + kmemleak_alloc(s, size, 1, GFP_KERNEL); + memset(s, 0, sizeof(*s)); + + /* Round up to actual allocation */ + val = (size - sizeof(*s)) / TASK_COMM_LEN; + s->cmdline_num = val; + s->map_cmdline_to_pid = kmalloc_array(val, sizeof(*s->map_cmdline_to_pid), GFP_KERNEL); - if (!s->map_cmdline_to_pid) - return -ENOMEM; - - s->saved_cmdlines = kmalloc_array(TASK_COMM_LEN, val, GFP_KERNEL); - if (!s->saved_cmdlines) { - kfree(s->map_cmdline_to_pid); - return -ENOMEM; + if (!s->map_cmdline_to_pid) { + free_saved_cmdlines_buffer(s); + return NULL; } s->cmdline_idx = 0; - s->cmdline_num = val; memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, val * sizeof(*s->map_cmdline_to_pid)); - return 0; + return s; } static int trace_create_savedcmd(void) { - int ret; - - savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL); - if (!savedcmd) - return -ENOMEM; + savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT); - ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd); - if (ret < 0) { - kfree(savedcmd); - savedcmd = NULL; - return -ENOMEM; - } - - return 0; + return savedcmd ? 0 : -ENOMEM; } int is_tracing_stopped(void) @@ -6056,26 +6070,14 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) -{ - kfree(s->saved_cmdlines); - kfree(s->map_cmdline_to_pid); - kfree(s); -} - static int tracing_resize_saved_cmdlines(unsigned int val) { struct saved_cmdlines_buffer *s, *savedcmd_temp; - s = kmalloc(sizeof(*s), GFP_KERNEL); + s = allocate_cmdlines_buffer(val); if (!s) return -ENOMEM; - if (allocate_cmdlines_buffer(val, s) < 0) { - kfree(s); - return -ENOMEM; - } - preempt_disable(); arch_spin_lock(&trace_cmdline_lock); savedcmd_temp = savedcmd; @@ -7291,6 +7293,8 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) return 0; } +#define TRACE_MARKER_MAX_SIZE 4096 + static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) @@ -7318,6 +7322,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if ((ssize_t)cnt < 0) return -EINVAL; + if (cnt > TRACE_MARKER_MAX_SIZE) + cnt = TRACE_MARKER_MAX_SIZE; + meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */ again: size = cnt + meta_size; @@ -7326,11 +7333,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (cnt < FAULTED_SIZE) size += FAULTED_SIZE - cnt; - if (size > TRACE_SEQ_BUFFER_SIZE) { - cnt -= size - TRACE_SEQ_BUFFER_SIZE; - goto again; - } - buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, tracing_gen_ctx()); @@ -8391,6 +8393,20 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return size; } +static int tracing_buffers_flush(struct file *file, fl_owner_t id) +{ + struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; + + iter->wait_index++; + /* Make sure the waiters see the new wait_index */ + smp_wmb(); + + ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); + + return 0; +} + static int tracing_buffers_release(struct inode *inode, struct file *file) { struct ftrace_buffer_info *info = file->private_data; @@ -8402,12 +8418,6 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) __trace_array_put(iter->tr); - iter->wait_index++; - /* Make sure the waiters see the new wait_index */ - smp_wmb(); - - ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); - if (info->spare) ring_buffer_free_read_page(iter->array_buffer->buffer, info->spare_cpu, info->spare); @@ -8623,6 +8633,7 @@ static const struct file_operations tracing_buffers_fops = { .read = tracing_buffers_read, .poll = tracing_buffers_poll, .release = tracing_buffers_release, + .flush = tracing_buffers_flush, .splice_read = tracing_buffers_splice_read, .unlocked_ioctl = tracing_buffers_ioctl, .llseek = no_llseek, diff --git a/kernel/trace/trace_btf.c b/kernel/trace/trace_btf.c index ca224d53bfdc..5bbdbcbbde3c 100644 --- a/kernel/trace/trace_btf.c +++ b/kernel/trace/trace_btf.c @@ -91,8 +91,8 @@ retry: for_each_member(i, type, member) { if (!member->name_off) { /* Anonymous union/struct: push it for later use */ - type = btf_type_skip_modifiers(btf, member->type, &tid); - if (type && top < BTF_ANON_STACK_MAX) { + if (btf_type_skip_modifiers(btf, member->type, &tid) && + top < BTF_ANON_STACK_MAX) { anon_stack[top].tid = tid; anon_stack[top++].offset = cur_offset + member->offset; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index e7af286af4f1..c82b401a294d 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -441,8 +441,9 @@ static unsigned int trace_string(struct synth_trace_event *entry, if (is_dynamic) { union trace_synth_field *data = &entry->fields[*n_u64]; + len = fetch_store_strlen((unsigned long)str_val); data->as_dynamic.offset = struct_size(entry, fields, event->n_u64) + data_size; - data->as_dynamic.len = fetch_store_strlen((unsigned long)str_val); + data->as_dynamic.len = len; ret = fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 3e7fa44dc2b2..d8b302d01083 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1587,12 +1587,11 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, { struct print_entry *field; struct trace_seq *s = &iter->seq; - int max = iter->ent_size - offsetof(struct print_entry, buf); trace_assign_type(field, iter->ent); seq_print_ip_sym(s, field->ip, flags); - trace_seq_printf(s, ": %.*s", max, field->buf); + trace_seq_printf(s, ": %s", field->buf); return trace_handle_return(s); } @@ -1601,11 +1600,10 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, struct trace_event *event) { struct print_entry *field; - int max = iter->ent_size - offsetof(struct print_entry, buf); trace_assign_type(field, iter->ent); - trace_seq_printf(&iter->seq, "# %lx %.*s", field->ip, max, field->buf); + trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf); return trace_handle_return(&iter->seq); } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 4dc74d73fc1d..34289f9c6707 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1159,9 +1159,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (!(ctx->flags & TPARG_FL_TEVENT) && (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 || strncmp(arg, "\\\"", 2) == 0)) { - /* The type of $comm must be "string", and not an array. */ - if (parg->count || (t && strcmp(t, "string"))) + /* The type of $comm must be "string", and not an array type. */ + if (parg->count || (t && strcmp(t, "string"))) { + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), + NEED_STRING_TYPE); goto out; + } parg->type = find_fetch_type("string", ctx->flags); } else parg->type = find_fetch_type(t, ctx->flags); @@ -1169,18 +1172,6 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE); goto out; } - parg->offset = *size; - *size += parg->type->size * (parg->count ?: 1); - - ret = -ENOMEM; - if (parg->count) { - len = strlen(parg->type->fmttype) + 6; - parg->fmt = kmalloc(len, GFP_KERNEL); - if (!parg->fmt) - goto out; - snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, - parg->count); - } code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); if (!code) @@ -1204,6 +1195,19 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, goto fail; } } + parg->offset = *size; + *size += parg->type->size * (parg->count ?: 1); + + if (parg->count) { + len = strlen(parg->type->fmttype) + 6; + parg->fmt = kmalloc(len, GFP_KERNEL); + if (!parg->fmt) { + ret = -ENOMEM; + goto out; + } + snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, + parg->count); + } ret = -EINVAL; /* Store operation */ diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 850d9ecb6765..c1877d018269 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -515,7 +515,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_HYPHEN, "Failed to parse single hyphen. Forgot '>'?"), \ C(NO_BTF_FIELD, "This field is not found."), \ C(BAD_BTF_TID, "Failed to get BTF type info."),\ - C(BAD_TYPE4STR, "This type does not fit for string."), + C(BAD_TYPE4STR, "This type does not fit for string."),\ + C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 76e60faed892..a60eb65955e7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -29,6 +29,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/init.h> +#include <linux/interrupt.h> #include <linux/signal.h> #include <linux/completion.h> #include <linux/workqueue.h> @@ -53,10 +54,11 @@ #include <linux/nmi.h> #include <linux/kvm_para.h> #include <linux/delay.h> +#include <linux/irq_work.h> #include "workqueue_internal.h" -enum { +enum worker_pool_flags { /* * worker_pool flags * @@ -72,10 +74,17 @@ enum { * Note that DISASSOCIATED should be flipped only while holding * wq_pool_attach_mutex to avoid changing binding state while * worker_attach_to_pool() is in progress. + * + * As there can only be one concurrent BH execution context per CPU, a + * BH pool is per-CPU and always DISASSOCIATED. */ - POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */ + POOL_BH = 1 << 0, /* is a BH pool */ + POOL_MANAGER_ACTIVE = 1 << 1, /* being managed */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ + POOL_BH_DRAINING = 1 << 3, /* draining after CPU offline */ +}; +enum worker_flags { /* worker flags */ WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ @@ -86,7 +95,13 @@ enum { WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE | WORKER_UNBOUND | WORKER_REBOUND, +}; +enum work_cancel_flags { + WORK_CANCEL_DELAYED = 1 << 0, /* canceling a delayed_work */ +}; + +enum wq_internal_consts { NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */ @@ -108,10 +123,18 @@ enum { RESCUER_NICE_LEVEL = MIN_NICE, HIGHPRI_NICE_LEVEL = MIN_NICE, - WQ_NAME_LEN = 24, + WQ_NAME_LEN = 32, }; /* + * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and + * MAX_SOFTIRQ_RESTART in kernel/softirq.c. These are macros because + * msecs_to_jiffies() can't be an initializer. + */ +#define BH_WORKER_JIFFIES msecs_to_jiffies(2) +#define BH_WORKER_RESTARTS 10 + +/* * Structure fields follow one of the following exclusion rules. * * I: Modifiable by initialization/destruction paths and read-only for @@ -122,6 +145,9 @@ enum { * * L: pool->lock protected. Access with pool->lock held. * + * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for + * reads. + * * K: Only modified by worker while holding pool->lock. Can be safely read by * self, while holding pool->lock or from IRQ context if %current is the * kworker. @@ -143,6 +169,9 @@ enum { * * WR: wq->mutex protected for writes. RCU protected for reads. * + * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read + * with READ_ONCE() without locking. + * * MD: wq_mayday_lock protected. * * WD: Used internally by the watchdog. @@ -219,7 +248,7 @@ enum pool_workqueue_stats { }; /* - * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS + * The per-pool workqueue. While queued, bits below WORK_PWQ_SHIFT * of work_struct->data are used for flags and the remaining high bits * point to the pwq; thus, pwqs need to be aligned at two's power of the * number of flag bits. @@ -232,6 +261,7 @@ struct pool_workqueue { int refcnt; /* L: reference count */ int nr_in_flight[WORK_NR_COLORS]; /* L: nr of in_flight works */ + bool plugged; /* L: execution suspended */ /* * nr_active management and WORK_STRUCT_INACTIVE: @@ -240,18 +270,18 @@ struct pool_workqueue { * pwq->inactive_works instead of pool->worklist and marked with * WORK_STRUCT_INACTIVE. * - * All work items marked with WORK_STRUCT_INACTIVE do not participate - * in pwq->nr_active and all work items in pwq->inactive_works are - * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE - * work items are in pwq->inactive_works. Some of them are ready to - * run in pool->worklist or worker->scheduled. Those work itmes are - * only struct wq_barrier which is used for flush_work() and should - * not participate in pwq->nr_active. For non-barrier work item, it - * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. + * All work items marked with WORK_STRUCT_INACTIVE do not participate in + * nr_active and all work items in pwq->inactive_works are marked with + * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are + * in pwq->inactive_works. Some of them are ready to run in + * pool->worklist or worker->scheduled. Those work itmes are only struct + * wq_barrier which is used for flush_work() and should not participate + * in nr_active. For non-barrier work item, it is marked with + * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. */ int nr_active; /* L: nr of active works */ - int max_active; /* L: max active works */ struct list_head inactive_works; /* L: inactive works */ + struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */ struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ @@ -265,7 +295,7 @@ struct pool_workqueue { */ struct kthread_work release_work; struct rcu_head rcu; -} __aligned(1 << WORK_STRUCT_FLAG_BITS); +} __aligned(1 << WORK_STRUCT_PWQ_SHIFT); /* * Structure used to wait for workqueue flush. @@ -279,6 +309,26 @@ struct wq_flusher { struct wq_device; /* + * Unlike in a per-cpu workqueue where max_active limits its concurrency level + * on each CPU, in an unbound workqueue, max_active applies to the whole system. + * As sharing a single nr_active across multiple sockets can be very expensive, + * the counting and enforcement is per NUMA node. + * + * The following struct is used to enforce per-node max_active. When a pwq wants + * to start executing a work item, it should increment ->nr using + * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over + * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish + * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in + * round-robin order. + */ +struct wq_node_nr_active { + int max; /* per-node max_active */ + atomic_t nr; /* per-node nr_active */ + raw_spinlock_t lock; /* nests inside pool locks */ + struct list_head pending_pwqs; /* LN: pwqs with inactive works */ +}; + +/* * The externally visible workqueue. It relays the issued work items to * the appropriate worker_pool through its pool_workqueues. */ @@ -298,10 +348,15 @@ struct workqueue_struct { struct worker *rescuer; /* MD: rescue worker */ int nr_drainers; /* WQ: drain in progress */ - int saved_max_active; /* WQ: saved pwq max_active */ + + /* See alloc_workqueue() function comment for info on min/max_active */ + int max_active; /* WO: max active works */ + int min_active; /* WO: min active works */ + int saved_max_active; /* WQ: saved max_active */ + int saved_min_active; /* WQ: saved min_active */ struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */ - struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */ + struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */ #ifdef CONFIG_SYSFS struct wq_device *wq_dev; /* I: for sysfs interface */ @@ -323,10 +378,9 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */ + struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */ }; -static struct kmem_cache *pwq_cache; - /* * Each pod type describes how CPUs should be grouped for unbound workqueues. * See the comment above workqueue_attrs->affn_scope. @@ -338,16 +392,13 @@ struct wq_pod_type { int *cpu_pod; /* cpu -> pod */ }; -static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; -static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; - static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { - [WQ_AFFN_DFL] = "default", - [WQ_AFFN_CPU] = "cpu", - [WQ_AFFN_SMT] = "smt", - [WQ_AFFN_CACHE] = "cache", - [WQ_AFFN_NUMA] = "numa", - [WQ_AFFN_SYSTEM] = "system", + [WQ_AFFN_DFL] = "default", + [WQ_AFFN_CPU] = "cpu", + [WQ_AFFN_SMT] = "smt", + [WQ_AFFN_CACHE] = "cache", + [WQ_AFFN_NUMA] = "numa", + [WQ_AFFN_SYSTEM] = "system", }; /* @@ -359,12 +410,22 @@ static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { */ static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX; module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); +#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT +static unsigned int wq_cpu_intensive_warning_thresh = 4; +module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh, uint, 0644); +#endif /* see the comment above the definition of WQ_POWER_EFFICIENT */ static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); static bool wq_online; /* can kworkers be created yet? */ +static bool wq_topo_initialized __read_mostly = false; + +static struct kmem_cache *pwq_cache; + +static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; +static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; /* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ static struct workqueue_attrs *wq_update_pod_attrs_buf; @@ -405,8 +466,17 @@ static bool wq_debug_force_rr_cpu = false; #endif module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); +/* to raise softirq for the BH worker pools on other CPUs */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], + bh_pool_irq_works); + +/* the BH worker pools */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], + bh_worker_pools); + /* the per-cpu worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], + cpu_worker_pools); static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ @@ -420,6 +490,12 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; /* + * Used to synchronize multiple cancel_sync attempts on the same work item. See + * work_grab_pending() and __cancel_work_sync(). + */ +static DECLARE_WAIT_QUEUE_HEAD(wq_cancel_waitq); + +/* * I: kthread_worker to release pwq's. pwq release needs to be bounced to a * process context while holding a pool lock. Bounce to a dedicated kthread * worker to avoid A-A deadlocks. @@ -440,6 +516,10 @@ struct workqueue_struct *system_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_power_efficient_wq); struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); +struct workqueue_struct *system_bh_wq; +EXPORT_SYMBOL_GPL(system_bh_wq); +struct workqueue_struct *system_bh_highpri_wq; +EXPORT_SYMBOL_GPL(system_bh_highpri_wq); static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); @@ -450,16 +530,21 @@ static void show_one_worker_pool(struct worker_pool *pool); #include <trace/events/workqueue.h> #define assert_rcu_or_pool_mutex() \ - RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() && \ !lockdep_is_held(&wq_pool_mutex), \ "RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() && \ !lockdep_is_held(&wq->mutex) && \ !lockdep_is_held(&wq_pool_mutex), \ "RCU, wq->mutex or wq_pool_mutex should be held") +#define for_each_bh_worker_pool(pool, cpu) \ + for ((pool) = &per_cpu(bh_worker_pools, cpu)[0]; \ + (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ + (pool)++) + #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ @@ -632,6 +717,36 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } +static struct pool_workqueue __rcu ** +unbound_pwq_slot(struct workqueue_struct *wq, int cpu) +{ + if (cpu >= 0) + return per_cpu_ptr(wq->cpu_pwq, cpu); + else + return &wq->dfl_pwq; +} + +/* @cpu < 0 for dfl_pwq */ +static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu) +{ + return rcu_dereference_check(*unbound_pwq_slot(wq, cpu), + lockdep_is_held(&wq_pool_mutex) || + lockdep_is_held(&wq->mutex)); +} + +/** + * unbound_effective_cpumask - effective cpumask of an unbound workqueue + * @wq: workqueue of interest + * + * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which + * is masked with wq_unbound_cpumask to determine the effective cpumask. The + * default pwq is always mapped to the pool with the current effective cpumask. + */ +static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq) +{ + return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask; +} + static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; @@ -653,10 +768,9 @@ static int work_next_color(int color) * contain the pointer to the queued pwq. Once execution starts, the flag * is cleared and the high bits contain OFFQ flags and pool ID. * - * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling() - * and clear_work_data() can be used to set the pwq, pool or clear - * work->data. These functions should only be called while the work is - * owned - ie. while the PENDING bit is set. + * set_work_pwq(), set_work_pool_and_clear_pending() and mark_work_canceling() + * can be used to set the pwq, pool or clear work->data. These functions should + * only be called while the work is owned - ie. while the PENDING bit is set. * * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq * corresponding to a work. Pool is available once the work has been @@ -668,29 +782,28 @@ static int work_next_color(int color) * but stay off timer and worklist for arbitrarily long and nobody should * try to steal the PENDING bit. */ -static inline void set_work_data(struct work_struct *work, unsigned long data, - unsigned long flags) +static inline void set_work_data(struct work_struct *work, unsigned long data) { WARN_ON_ONCE(!work_pending(work)); - atomic_long_set(&work->data, data | flags | work_static(work)); + atomic_long_set(&work->data, data | work_static(work)); } static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq, - unsigned long extra_flags) + unsigned long flags) { - set_work_data(work, (unsigned long)pwq, - WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags); + set_work_data(work, (unsigned long)pwq | WORK_STRUCT_PENDING | + WORK_STRUCT_PWQ | flags); } static void set_work_pool_and_keep_pending(struct work_struct *work, - int pool_id) + int pool_id, unsigned long flags) { - set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, - WORK_STRUCT_PENDING); + set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) | + WORK_STRUCT_PENDING | flags); } static void set_work_pool_and_clear_pending(struct work_struct *work, - int pool_id) + int pool_id, unsigned long flags) { /* * The following wmb is paired with the implied mb in @@ -699,7 +812,8 @@ static void set_work_pool_and_clear_pending(struct work_struct *work, * owner. */ smp_wmb(); - set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0); + set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) | + flags); /* * The following mb guarantees that previous clear of a PENDING bit * will not be reordered with any speculative LOADS or STORES from @@ -731,15 +845,9 @@ static void set_work_pool_and_clear_pending(struct work_struct *work, smp_mb(); } -static void clear_work_data(struct work_struct *work) -{ - smp_wmb(); /* see set_work_pool_and_clear_pending() */ - set_work_data(work, WORK_STRUCT_NO_POOL, 0); -} - static inline struct pool_workqueue *work_struct_pwq(unsigned long data) { - return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK); + return (struct pool_workqueue *)(data & WORK_STRUCT_PWQ_MASK); } static struct pool_workqueue *get_work_pwq(struct work_struct *work) @@ -806,7 +914,7 @@ static void mark_work_canceling(struct work_struct *work) unsigned long pool_id = get_work_pool_id(work); pool_id <<= WORK_OFFQ_POOL_SHIFT; - set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING); + set_work_data(work, pool_id | WORK_STRUCT_PENDING | WORK_OFFQ_CANCELING); } static bool work_is_canceling(struct work_struct *work) @@ -1101,6 +1209,29 @@ static bool assign_work(struct work_struct *work, struct worker *worker, return true; } +static struct irq_work *bh_pool_irq_work(struct worker_pool *pool) +{ + int high = pool->attrs->nice == HIGHPRI_NICE_LEVEL ? 1 : 0; + + return &per_cpu(bh_pool_irq_works, pool->cpu)[high]; +} + +static void kick_bh_pool(struct worker_pool *pool) +{ +#ifdef CONFIG_SMP + /* see drain_dead_softirq_workfn() for BH_DRAINING */ + if (unlikely(pool->cpu != smp_processor_id() && + !(pool->flags & POOL_BH_DRAINING))) { + irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu); + return; + } +#endif + if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) + raise_softirq_irqoff(HI_SOFTIRQ); + else + raise_softirq_irqoff(TASKLET_SOFTIRQ); +} + /** * kick_pool - wake up an idle worker if necessary * @pool: pool to kick @@ -1118,6 +1249,11 @@ static bool kick_pool(struct worker_pool *pool) if (!need_more_worker(pool) || !worker) return false; + if (pool->flags & POOL_BH) { + kick_bh_pool(pool); + return true; + } + p = worker->task; #ifdef CONFIG_SMP @@ -1198,11 +1334,13 @@ restart: u64 cnt; /* - * Start reporting from the fourth time and back off + * Start reporting from the warning_thresh and back off * exponentially. */ cnt = atomic64_inc_return_relaxed(&ent->cnt); - if (cnt >= 4 && is_power_of_2(cnt)) + if (wq_cpu_intensive_warning_thresh && + cnt >= wq_cpu_intensive_warning_thresh && + is_power_of_2(cnt + 1 - wq_cpu_intensive_warning_thresh)) printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n", ent->func, wq_cpu_intensive_thresh_us, atomic64_read(&ent->cnt)); @@ -1231,10 +1369,12 @@ restart: ent = &wci_ents[wci_nr_ents++]; ent->func = func; - atomic64_set(&ent->cnt, 1); + atomic64_set(&ent->cnt, 0); hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func); raw_spin_unlock(&wci_lock); + + goto restart; } #else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ @@ -1402,6 +1542,74 @@ work_func_t wq_worker_last_func(struct task_struct *task) } /** + * wq_node_nr_active - Determine wq_node_nr_active to use + * @wq: workqueue of interest + * @node: NUMA node, can be %NUMA_NO_NODE + * + * Determine wq_node_nr_active to use for @wq on @node. Returns: + * + * - %NULL for per-cpu workqueues as they don't need to use shared nr_active. + * + * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE. + * + * - Otherwise, node_nr_active[@node]. + */ +static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq, + int node) +{ + if (!(wq->flags & WQ_UNBOUND)) + return NULL; + + if (node == NUMA_NO_NODE) + node = nr_node_ids; + + return wq->node_nr_active[node]; +} + +/** + * wq_update_node_max_active - Update per-node max_actives to use + * @wq: workqueue to update + * @off_cpu: CPU that's going down, -1 if a CPU is not going down + * + * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is + * distributed among nodes according to the proportions of numbers of online + * cpus. The result is always between @wq->min_active and max_active. + */ +static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu) +{ + struct cpumask *effective = unbound_effective_cpumask(wq); + int min_active = READ_ONCE(wq->min_active); + int max_active = READ_ONCE(wq->max_active); + int total_cpus, node; + + lockdep_assert_held(&wq->mutex); + + if (!wq_topo_initialized) + return; + + if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective)) + off_cpu = -1; + + total_cpus = cpumask_weight_and(effective, cpu_online_mask); + if (off_cpu >= 0) + total_cpus--; + + for_each_node(node) { + int node_cpus; + + node_cpus = cpumask_weight_and(effective, cpumask_of_node(node)); + if (off_cpu >= 0 && cpu_to_node(off_cpu) == node) + node_cpus--; + + wq_node_nr_active(wq, node)->max = + clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus), + min_active, max_active); + } + + wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active; +} + +/** * get_pwq - get an extra reference on the specified pool_workqueue * @pwq: pool_workqueue to get * @@ -1453,24 +1661,336 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) } } -static void pwq_activate_inactive_work(struct work_struct *work) +static bool pwq_is_empty(struct pool_workqueue *pwq) { - struct pool_workqueue *pwq = get_work_pwq(work); + return !pwq->nr_active && list_empty(&pwq->inactive_works); +} +static void __pwq_activate_work(struct pool_workqueue *pwq, + struct work_struct *work) +{ + unsigned long *wdb = work_data_bits(work); + + WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE)); trace_workqueue_activate_work(work); if (list_empty(&pwq->pool->worklist)) pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); - __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work)); + __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb); +} + +/** + * pwq_activate_work - Activate a work item if inactive + * @pwq: pool_workqueue @work belongs to + * @work: work item to activate + * + * Returns %true if activated. %false if already active. + */ +static bool pwq_activate_work(struct pool_workqueue *pwq, + struct work_struct *work) +{ + struct worker_pool *pool = pwq->pool; + struct wq_node_nr_active *nna; + + lockdep_assert_held(&pool->lock); + + if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE)) + return false; + + nna = wq_node_nr_active(pwq->wq, pool->node); + if (nna) + atomic_inc(&nna->nr); + pwq->nr_active++; + __pwq_activate_work(pwq, work); + return true; +} + +static bool tryinc_node_nr_active(struct wq_node_nr_active *nna) +{ + int max = READ_ONCE(nna->max); + + while (true) { + int old, tmp; + + old = atomic_read(&nna->nr); + if (old >= max) + return false; + tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1); + if (tmp == old) + return true; + } +} + +/** + * pwq_tryinc_nr_active - Try to increment nr_active for a pwq + * @pwq: pool_workqueue of interest + * @fill: max_active may have increased, try to increase concurrency level + * + * Try to increment nr_active for @pwq. Returns %true if an nr_active count is + * successfully obtained. %false otherwise. + */ +static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill) +{ + struct workqueue_struct *wq = pwq->wq; + struct worker_pool *pool = pwq->pool; + struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node); + bool obtained = false; + + lockdep_assert_held(&pool->lock); + + if (!nna) { + /* BH or per-cpu workqueue, pwq->nr_active is sufficient */ + obtained = pwq->nr_active < READ_ONCE(wq->max_active); + goto out; + } + + if (unlikely(pwq->plugged)) + return false; + + /* + * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is + * already waiting on $nna, pwq_dec_nr_active() will maintain the + * concurrency level. Don't jump the line. + * + * We need to ignore the pending test after max_active has increased as + * pwq_dec_nr_active() can only maintain the concurrency level but not + * increase it. This is indicated by @fill. + */ + if (!list_empty(&pwq->pending_node) && likely(!fill)) + goto out; + + obtained = tryinc_node_nr_active(nna); + if (obtained) + goto out; + + /* + * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs + * and try again. The smp_mb() is paired with the implied memory barrier + * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either + * we see the decremented $nna->nr or they see non-empty + * $nna->pending_pwqs. + */ + raw_spin_lock(&nna->lock); + + if (list_empty(&pwq->pending_node)) + list_add_tail(&pwq->pending_node, &nna->pending_pwqs); + else if (likely(!fill)) + goto out_unlock; + + smp_mb(); + + obtained = tryinc_node_nr_active(nna); + + /* + * If @fill, @pwq might have already been pending. Being spuriously + * pending in cold paths doesn't affect anything. Let's leave it be. + */ + if (obtained && likely(!fill)) + list_del_init(&pwq->pending_node); + +out_unlock: + raw_spin_unlock(&nna->lock); +out: + if (obtained) + pwq->nr_active++; + return obtained; +} + +/** + * pwq_activate_first_inactive - Activate the first inactive work item on a pwq + * @pwq: pool_workqueue of interest + * @fill: max_active may have increased, try to increase concurrency level + * + * Activate the first inactive work item of @pwq if available and allowed by + * max_active limit. + * + * Returns %true if an inactive work item has been activated. %false if no + * inactive work item is found or max_active limit is reached. + */ +static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill) +{ + struct work_struct *work = + list_first_entry_or_null(&pwq->inactive_works, + struct work_struct, entry); + + if (work && pwq_tryinc_nr_active(pwq, fill)) { + __pwq_activate_work(pwq, work); + return true; + } else { + return false; + } } -static void pwq_activate_first_inactive(struct pool_workqueue *pwq) +/** + * unplug_oldest_pwq - unplug the oldest pool_workqueue + * @wq: workqueue_struct where its oldest pwq is to be unplugged + * + * This function should only be called for ordered workqueues where only the + * oldest pwq is unplugged, the others are plugged to suspend execution to + * ensure proper work item ordering:: + * + * dfl_pwq --------------+ [P] - plugged + * | + * v + * pwqs -> A -> B [P] -> C [P] (newest) + * | | | + * 1 3 5 + * | | | + * 2 4 6 + * + * When the oldest pwq is drained and removed, this function should be called + * to unplug the next oldest one to start its work item execution. Note that + * pwq's are linked into wq->pwqs with the oldest first, so the first one in + * the list is the oldest. + */ +static void unplug_oldest_pwq(struct workqueue_struct *wq) { - struct work_struct *work = list_first_entry(&pwq->inactive_works, - struct work_struct, entry); + struct pool_workqueue *pwq; - pwq_activate_inactive_work(work); + lockdep_assert_held(&wq->mutex); + + /* Caller should make sure that pwqs isn't empty before calling */ + pwq = list_first_entry_or_null(&wq->pwqs, struct pool_workqueue, + pwqs_node); + raw_spin_lock_irq(&pwq->pool->lock); + if (pwq->plugged) { + pwq->plugged = false; + if (pwq_activate_first_inactive(pwq, true)) + kick_pool(pwq->pool); + } + raw_spin_unlock_irq(&pwq->pool->lock); +} + +/** + * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active + * @nna: wq_node_nr_active to activate a pending pwq for + * @caller_pool: worker_pool the caller is locking + * + * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked. + * @caller_pool may be unlocked and relocked to lock other worker_pools. + */ +static void node_activate_pending_pwq(struct wq_node_nr_active *nna, + struct worker_pool *caller_pool) +{ + struct worker_pool *locked_pool = caller_pool; + struct pool_workqueue *pwq; + struct work_struct *work; + + lockdep_assert_held(&caller_pool->lock); + + raw_spin_lock(&nna->lock); +retry: + pwq = list_first_entry_or_null(&nna->pending_pwqs, + struct pool_workqueue, pending_node); + if (!pwq) + goto out_unlock; + + /* + * If @pwq is for a different pool than @locked_pool, we need to lock + * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock + * / lock dance. For that, we also need to release @nna->lock as it's + * nested inside pool locks. + */ + if (pwq->pool != locked_pool) { + raw_spin_unlock(&locked_pool->lock); + locked_pool = pwq->pool; + if (!raw_spin_trylock(&locked_pool->lock)) { + raw_spin_unlock(&nna->lock); + raw_spin_lock(&locked_pool->lock); + raw_spin_lock(&nna->lock); + goto retry; + } + } + + /* + * $pwq may not have any inactive work items due to e.g. cancellations. + * Drop it from pending_pwqs and see if there's another one. + */ + work = list_first_entry_or_null(&pwq->inactive_works, + struct work_struct, entry); + if (!work) { + list_del_init(&pwq->pending_node); + goto retry; + } + + /* + * Acquire an nr_active count and activate the inactive work item. If + * $pwq still has inactive work items, rotate it to the end of the + * pending_pwqs so that we round-robin through them. This means that + * inactive work items are not activated in queueing order which is fine + * given that there has never been any ordering across different pwqs. + */ + if (likely(tryinc_node_nr_active(nna))) { + pwq->nr_active++; + __pwq_activate_work(pwq, work); + + if (list_empty(&pwq->inactive_works)) + list_del_init(&pwq->pending_node); + else + list_move_tail(&pwq->pending_node, &nna->pending_pwqs); + + /* if activating a foreign pool, make sure it's running */ + if (pwq->pool != caller_pool) + kick_pool(pwq->pool); + } + +out_unlock: + raw_spin_unlock(&nna->lock); + if (locked_pool != caller_pool) { + raw_spin_unlock(&locked_pool->lock); + raw_spin_lock(&caller_pool->lock); + } +} + +/** + * pwq_dec_nr_active - Retire an active count + * @pwq: pool_workqueue of interest + * + * Decrement @pwq's nr_active and try to activate the first inactive work item. + * For unbound workqueues, this function may temporarily drop @pwq->pool->lock. + */ +static void pwq_dec_nr_active(struct pool_workqueue *pwq) +{ + struct worker_pool *pool = pwq->pool; + struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node); + + lockdep_assert_held(&pool->lock); + + /* + * @pwq->nr_active should be decremented for both percpu and unbound + * workqueues. + */ + pwq->nr_active--; + + /* + * For a percpu workqueue, it's simple. Just need to kick the first + * inactive work item on @pwq itself. + */ + if (!nna) { + pwq_activate_first_inactive(pwq, false); + return; + } + + /* + * If @pwq is for an unbound workqueue, it's more complicated because + * multiple pwqs and pools may be sharing the nr_active count. When a + * pwq needs to wait for an nr_active count, it puts itself on + * $nna->pending_pwqs. The following atomic_dec_return()'s implied + * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to + * guarantee that either we see non-empty pending_pwqs or they see + * decremented $nna->nr. + * + * $nna->max may change as CPUs come online/offline and @pwq->wq's + * max_active gets updated. However, it is guaranteed to be equal to or + * larger than @pwq->wq->min_active which is above zero unless freezing. + * This maintains the forward progress guarantee. + */ + if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max)) + return; + + if (!list_empty(&nna->pending_pwqs)) + node_activate_pending_pwq(nna, pool); } /** @@ -1481,6 +2001,11 @@ static void pwq_activate_first_inactive(struct pool_workqueue *pwq) * A work either has completed or is removed from pending queue, * decrement nr_in_flight of its pwq and handle workqueue flushing. * + * NOTE: + * For unbound workqueues, this function may temporarily drop @pwq->pool->lock + * and thus should be called after all other state updates for the in-flight + * work item is complete. + * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ @@ -1488,14 +2013,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_ { int color = get_work_color(work_data); - if (!(work_data & WORK_STRUCT_INACTIVE)) { - pwq->nr_active--; - if (!list_empty(&pwq->inactive_works)) { - /* one down, submit an inactive one */ - if (pwq->nr_active < pwq->max_active) - pwq_activate_first_inactive(pwq); - } - } + if (!(work_data & WORK_STRUCT_INACTIVE)) + pwq_dec_nr_active(pwq); pwq->nr_in_flight[color]--; @@ -1523,8 +2042,8 @@ out_put: /** * try_to_grab_pending - steal work item from worklist and disable irq * @work: work item to steal - * @is_dwork: @work is a delayed_work - * @flags: place to store irq state + * @cflags: %WORK_CANCEL_ flags + * @irq_flags: place to store irq state * * Try to grab PENDING bit of @work. This function can handle @work in any * stable state - idle, on timer or on worklist. @@ -1546,20 +2065,20 @@ out_put: * irqsafe, ensures that we return -EAGAIN for finite short period of time. * * On successful return, >= 0, irq is disabled and the caller is - * responsible for releasing it using local_irq_restore(*@flags). + * responsible for releasing it using local_irq_restore(*@irq_flags). * * This function is safe to call from any context including IRQ handler. */ -static int try_to_grab_pending(struct work_struct *work, bool is_dwork, - unsigned long *flags) +static int try_to_grab_pending(struct work_struct *work, u32 cflags, + unsigned long *irq_flags) { struct worker_pool *pool; struct pool_workqueue *pwq; - local_irq_save(*flags); + local_irq_save(*irq_flags); /* try to steal the timer if it exists */ - if (is_dwork) { + if (cflags & WORK_CANCEL_DELAYED) { struct delayed_work *dwork = to_delayed_work(work); /* @@ -1595,6 +2114,8 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, */ pwq = get_work_pwq(work); if (pwq && pwq->pool == pool) { + unsigned long work_data; + debug_work_deactivate(work); /* @@ -1608,14 +2129,19 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, * management later on and cause stall. Make sure the work * item is activated before grabbing. */ - if (*work_data_bits(work) & WORK_STRUCT_INACTIVE) - pwq_activate_inactive_work(work); + pwq_activate_work(pwq, work); list_del_init(&work->entry); - pwq_dec_nr_in_flight(pwq, *work_data_bits(work)); - /* work->data points to pwq iff queued, point to pool */ - set_work_pool_and_keep_pending(work, pool->id); + /* + * work->data points to pwq iff queued. Let's point to pool. As + * this destroys work->data needed by the next step, stash it. + */ + work_data = *work_data_bits(work); + set_work_pool_and_keep_pending(work, pool->id, 0); + + /* must be the last step, see the function comment */ + pwq_dec_nr_in_flight(pwq, work_data); raw_spin_unlock(&pool->lock); rcu_read_unlock(); @@ -1624,13 +2150,82 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, raw_spin_unlock(&pool->lock); fail: rcu_read_unlock(); - local_irq_restore(*flags); + local_irq_restore(*irq_flags); if (work_is_canceling(work)) return -ENOENT; cpu_relax(); return -EAGAIN; } +struct cwt_wait { + wait_queue_entry_t wait; + struct work_struct *work; +}; + +static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) +{ + struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); + + if (cwait->work != key) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +/** + * work_grab_pending - steal work item from worklist and disable irq + * @work: work item to steal + * @cflags: %WORK_CANCEL_ flags + * @irq_flags: place to store IRQ state + * + * Grab PENDING bit of @work. @work can be in any stable state - idle, on timer + * or on worklist. + * + * Must be called in process context. IRQ is disabled on return with IRQ state + * stored in *@irq_flags. The caller is responsible for re-enabling it using + * local_irq_restore(). + * + * Returns %true if @work was pending. %false if idle. + */ +static bool work_grab_pending(struct work_struct *work, u32 cflags, + unsigned long *irq_flags) +{ + struct cwt_wait cwait; + int ret; + + might_sleep(); +repeat: + ret = try_to_grab_pending(work, cflags, irq_flags); + if (likely(ret >= 0)) + return ret; + if (ret != -ENOENT) + goto repeat; + + /* + * Someone is already canceling. Wait for it to finish. flush_work() + * doesn't work for PREEMPT_NONE because we may get woken up between + * @work's completion and the other canceling task resuming and clearing + * CANCELING - flush_work() will return false immediately as @work is no + * longer busy, try_to_grab_pending() will return -ENOENT as @work is + * still being canceled and the other canceling task won't be able to + * clear CANCELING as we're hogging the CPU. + * + * Let's wait for completion using a waitqueue. As this may lead to the + * thundering herd problem, use a custom wake function which matches + * @work along with exclusive wait and wakeup. + */ + init_wait(&cwait.wait); + cwait.wait.func = cwt_wakefn; + cwait.work = work; + + prepare_to_wait_exclusive(&wq_cancel_waitq, &cwait.wait, + TASK_UNINTERRUPTIBLE); + if (work_is_canceling(work)) + schedule(); + finish_wait(&wq_cancel_waitq, &cwait.wait); + + goto repeat; +} + /** * insert_work - insert a work into a pool * @pwq: pwq @work belongs to @@ -1718,7 +2313,6 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, */ lockdep_assert_irqs_disabled(); - /* * For a draining wq, only works from the same workqueue are * allowed. The __WQ_DESTROYING helps to spot the issue that @@ -1793,12 +2387,16 @@ retry: pwq->nr_in_flight[pwq->work_color]++; work_flags = work_color_to_flags(pwq->work_color); - if (likely(pwq->nr_active < pwq->max_active)) { + /* + * Limit the number of concurrently active work items to max_active. + * @work must also queue behind existing inactive work items to maintain + * ordering when max_active changes. See wq_adjust_max_active(). + */ + if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) { if (list_empty(&pool->worklist)) pool->watchdog_ts = jiffies; trace_workqueue_activate_work(work); - pwq->nr_active++; insert_work(pwq, work, &pool->worklist, work_flags); kick_pool(pool); } else { @@ -1829,16 +2427,16 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { bool ret = false; - unsigned long flags; + unsigned long irq_flags; - local_irq_save(flags); + local_irq_save(irq_flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_work(cpu, wq, work); ret = true; } - local_irq_restore(flags); + local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL(queue_work_on); @@ -1895,7 +2493,7 @@ static int select_numa_node_cpu(int node) bool queue_work_node(int node, struct workqueue_struct *wq, struct work_struct *work) { - unsigned long flags; + unsigned long irq_flags; bool ret = false; /* @@ -1909,7 +2507,7 @@ bool queue_work_node(int node, struct workqueue_struct *wq, */ WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)); - local_irq_save(flags); + local_irq_save(irq_flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { int cpu = select_numa_node_cpu(node); @@ -1918,7 +2516,7 @@ bool queue_work_node(int node, struct workqueue_struct *wq, ret = true; } - local_irq_restore(flags); + local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL_GPL(queue_work_node); @@ -1958,10 +2556,18 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, dwork->cpu = cpu; timer->expires = jiffies + delay; - if (unlikely(cpu != WORK_CPU_UNBOUND)) + if (housekeeping_enabled(HK_TYPE_TIMER)) { + /* If the current cpu is a housekeeping cpu, use it. */ + cpu = smp_processor_id(); + if (!housekeeping_test_cpu(cpu, HK_TYPE_TIMER)) + cpu = housekeeping_any_cpu(HK_TYPE_TIMER); add_timer_on(timer, cpu); - else - add_timer(timer); + } else { + if (likely(cpu == WORK_CPU_UNBOUND)) + add_timer(timer); + else + add_timer_on(timer, cpu); + } } /** @@ -1980,17 +2586,17 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, { struct work_struct *work = &dwork->work; bool ret = false; - unsigned long flags; + unsigned long irq_flags; /* read the comment in __queue_work() */ - local_irq_save(flags); + local_irq_save(irq_flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_delayed_work(cpu, wq, dwork, delay); ret = true; } - local_irq_restore(flags); + local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL(queue_delayed_work_on); @@ -2016,16 +2622,17 @@ EXPORT_SYMBOL(queue_delayed_work_on); bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { - unsigned long flags; + unsigned long irq_flags; int ret; do { - ret = try_to_grab_pending(&dwork->work, true, &flags); + ret = try_to_grab_pending(&dwork->work, WORK_CANCEL_DELAYED, + &irq_flags); } while (unlikely(ret == -EAGAIN)); if (likely(ret >= 0)) { __queue_delayed_work(cpu, wq, dwork, delay); - local_irq_restore(flags); + local_irq_restore(irq_flags); } /* -ENOENT from try_to_grab_pending() becomes %true */ @@ -2100,19 +2707,21 @@ static cpumask_t *pool_allowed_cpus(struct worker_pool *pool) * cpu-[un]hotplugs. */ static void worker_attach_to_pool(struct worker *worker, - struct worker_pool *pool) + struct worker_pool *pool) { mutex_lock(&wq_pool_attach_mutex); /* - * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains - * stable across this function. See the comments above the flag - * definition for details. + * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains stable + * across this function. See the comments above the flag definition for + * details. BH workers are, while per-CPU, always DISASSOCIATED. */ - if (pool->flags & POOL_DISASSOCIATED) + if (pool->flags & POOL_DISASSOCIATED) { worker->flags |= WORKER_UNBOUND; - else + } else { + WARN_ON_ONCE(pool->flags & POOL_BH); kthread_set_per_cpu(worker->task, pool->cpu); + } if (worker->rescue_wq) set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool)); @@ -2136,6 +2745,9 @@ static void worker_detach_from_pool(struct worker *worker) struct worker_pool *pool = worker->pool; struct completion *detach_completion = NULL; + /* there is one permanent BH worker per CPU which should never detach */ + WARN_ON_ONCE(pool->flags & POOL_BH); + mutex_lock(&wq_pool_attach_mutex); kthread_set_per_cpu(worker->task, -1); @@ -2187,27 +2799,29 @@ static struct worker *create_worker(struct worker_pool *pool) worker->id = id; - if (pool->cpu >= 0) - snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id, - pool->attrs->nice < 0 ? "H" : ""); - else - snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); - - worker->task = kthread_create_on_node(worker_thread, worker, pool->node, - "kworker/%s", id_buf); - if (IS_ERR(worker->task)) { - if (PTR_ERR(worker->task) == -EINTR) { - pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n", - id_buf); - } else { - pr_err_once("workqueue: Failed to create a worker thread: %pe", - worker->task); + if (!(pool->flags & POOL_BH)) { + if (pool->cpu >= 0) + snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id, + pool->attrs->nice < 0 ? "H" : ""); + else + snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); + + worker->task = kthread_create_on_node(worker_thread, worker, + pool->node, "kworker/%s", id_buf); + if (IS_ERR(worker->task)) { + if (PTR_ERR(worker->task) == -EINTR) { + pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n", + id_buf); + } else { + pr_err_once("workqueue: Failed to create a worker thread: %pe", + worker->task); + } + goto fail; } - goto fail; - } - set_user_nice(worker->task, pool->attrs->nice); - kthread_bind_mask(worker->task, pool_allowed_cpus(pool)); + set_user_nice(worker->task, pool->attrs->nice); + kthread_bind_mask(worker->task, pool_allowed_cpus(pool)); + } /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); @@ -2217,14 +2831,14 @@ static struct worker *create_worker(struct worker_pool *pool) worker->pool->nr_workers++; worker_enter_idle(worker); - kick_pool(pool); /* * @worker is waiting on a completion in kthread() and will trigger hung - * check if not woken up soon. As kick_pool() might not have waken it - * up, wake it up explicitly once more. + * check if not woken up soon. As kick_pool() is noop if @pool is empty, + * wake it up explicitly. */ - wake_up_process(worker->task); + if (worker->task) + wake_up_process(worker->task); raw_spin_unlock_irq(&pool->lock); @@ -2543,6 +3157,8 @@ __acquires(&pool->lock) struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; unsigned long work_data; + int lockdep_start_depth, rcu_start_depth; + bool bh_draining = pool->flags & POOL_BH_DRAINING; #ifdef CONFIG_LOCKDEP /* * It is permissible to free the struct work_struct from @@ -2565,7 +3181,8 @@ __acquires(&pool->lock) worker->current_work = work; worker->current_func = work->func; worker->current_pwq = pwq; - worker->current_at = worker->task->se.sum_exec_runtime; + if (worker->task) + worker->current_at = worker->task->se.sum_exec_runtime; work_data = *work_data_bits(work); worker->current_color = get_work_color(work_data); @@ -2600,12 +3217,16 @@ __acquires(&pool->lock) * PENDING and queued state changes happen together while IRQ is * disabled. */ - set_work_pool_and_clear_pending(work, pool->id); + set_work_pool_and_clear_pending(work, pool->id, 0); pwq->stats[PWQ_STAT_STARTED]++; raw_spin_unlock_irq(&pool->lock); - lock_map_acquire(&pwq->wq->lockdep_map); + rcu_start_depth = rcu_preempt_depth(); + lockdep_start_depth = lockdep_depth(current); + /* see drain_dead_softirq_workfn() */ + if (!bh_draining) + lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); /* * Strictly speaking we should mark the invariant state without holding @@ -2638,12 +3259,17 @@ __acquires(&pool->lock) trace_workqueue_execute_end(work, worker->current_func); pwq->stats[PWQ_STAT_COMPLETED]++; lock_map_release(&lockdep_map); - lock_map_release(&pwq->wq->lockdep_map); + if (!bh_draining) + lock_map_release(&pwq->wq->lockdep_map); - if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" - " last function: %ps\n", - current->comm, preempt_count(), task_pid_nr(current), + if (unlikely((worker->task && in_atomic()) || + lockdep_depth(current) != lockdep_start_depth || + rcu_preempt_depth() != rcu_start_depth)) { + pr_err("BUG: workqueue leaked atomic, lock or RCU: %s[%d]\n" + " preempt=0x%08x lock=%d->%d RCU=%d->%d workfn=%ps\n", + current->comm, task_pid_nr(current), preempt_count(), + lockdep_start_depth, lockdep_depth(current), + rcu_start_depth, rcu_preempt_depth(), worker->current_func); debug_show_held_locks(current); dump_stack(); @@ -2657,7 +3283,8 @@ __acquires(&pool->lock) * stop_machine. At the same time, report a quiescent RCU state so * the same condition doesn't freeze RCU. */ - cond_resched(); + if (worker->task) + cond_resched(); raw_spin_lock_irq(&pool->lock); @@ -2677,6 +3304,8 @@ __acquires(&pool->lock) worker->current_func = NULL; worker->current_pwq = NULL; worker->current_color = INT_MAX; + + /* must be the last step, see the function comment */ pwq_dec_nr_in_flight(pwq, work_data); } @@ -2938,6 +3567,139 @@ repeat: goto repeat; } +static void bh_worker(struct worker *worker) +{ + struct worker_pool *pool = worker->pool; + int nr_restarts = BH_WORKER_RESTARTS; + unsigned long end = jiffies + BH_WORKER_JIFFIES; + + raw_spin_lock_irq(&pool->lock); + worker_leave_idle(worker); + + /* + * This function follows the structure of worker_thread(). See there for + * explanations on each step. + */ + if (!need_more_worker(pool)) + goto done; + + WARN_ON_ONCE(!list_empty(&worker->scheduled)); + worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); + + do { + struct work_struct *work = + list_first_entry(&pool->worklist, + struct work_struct, entry); + + if (assign_work(work, worker, NULL)) + process_scheduled_works(worker); + } while (keep_working(pool) && + --nr_restarts && time_before(jiffies, end)); + + worker_set_flags(worker, WORKER_PREP); +done: + worker_enter_idle(worker); + kick_pool(pool); + raw_spin_unlock_irq(&pool->lock); +} + +/* + * TODO: Convert all tasklet users to workqueue and use softirq directly. + * + * This is currently called from tasklet[_hi]action() and thus is also called + * whenever there are tasklets to run. Let's do an early exit if there's nothing + * queued. Once conversion from tasklet is complete, the need_more_worker() test + * can be dropped. + * + * After full conversion, we'll add worker->softirq_action, directly use the + * softirq action and obtain the worker pointer from the softirq_action pointer. + */ +void workqueue_softirq_action(bool highpri) +{ + struct worker_pool *pool = + &per_cpu(bh_worker_pools, smp_processor_id())[highpri]; + if (need_more_worker(pool)) + bh_worker(list_first_entry(&pool->workers, struct worker, node)); +} + +struct wq_drain_dead_softirq_work { + struct work_struct work; + struct worker_pool *pool; + struct completion done; +}; + +static void drain_dead_softirq_workfn(struct work_struct *work) +{ + struct wq_drain_dead_softirq_work *dead_work = + container_of(work, struct wq_drain_dead_softirq_work, work); + struct worker_pool *pool = dead_work->pool; + bool repeat; + + /* + * @pool's CPU is dead and we want to execute its still pending work + * items from this BH work item which is running on a different CPU. As + * its CPU is dead, @pool can't be kicked and, as work execution path + * will be nested, a lockdep annotation needs to be suppressed. Mark + * @pool with %POOL_BH_DRAINING for the special treatments. + */ + raw_spin_lock_irq(&pool->lock); + pool->flags |= POOL_BH_DRAINING; + raw_spin_unlock_irq(&pool->lock); + + bh_worker(list_first_entry(&pool->workers, struct worker, node)); + + raw_spin_lock_irq(&pool->lock); + pool->flags &= ~POOL_BH_DRAINING; + repeat = need_more_worker(pool); + raw_spin_unlock_irq(&pool->lock); + + /* + * bh_worker() might hit consecutive execution limit and bail. If there + * still are pending work items, reschedule self and return so that we + * don't hog this CPU's BH. + */ + if (repeat) { + if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) + queue_work(system_bh_highpri_wq, work); + else + queue_work(system_bh_wq, work); + } else { + complete(&dead_work->done); + } +} + +/* + * @cpu is dead. Drain the remaining BH work items on the current CPU. It's + * possible to allocate dead_work per CPU and avoid flushing. However, then we + * have to worry about draining overlapping with CPU coming back online or + * nesting (one CPU's dead_work queued on another CPU which is also dead and so + * on). Let's keep it simple and drain them synchronously. These are BH work + * items which shouldn't be requeued on the same pool. Shouldn't take long. + */ +void workqueue_softirq_dead(unsigned int cpu) +{ + int i; + + for (i = 0; i < NR_STD_WORKER_POOLS; i++) { + struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[i]; + struct wq_drain_dead_softirq_work dead_work; + + if (!need_more_worker(pool)) + continue; + + INIT_WORK(&dead_work.work, drain_dead_softirq_workfn); + dead_work.pool = pool; + init_completion(&dead_work.done); + + if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) + queue_work(system_bh_highpri_wq, &dead_work.work); + else + queue_work(system_bh_wq, &dead_work.work); + + wait_for_completion(&dead_work.done); + } +} + /** * check_flush_dependency - check for flush dependency sanity * @target_wq: workqueue being flushed @@ -3010,6 +3772,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, struct work_struct *target, struct worker *worker) { + static __maybe_unused struct lock_class_key bh_key, thr_key; unsigned int work_flags = 0; unsigned int work_color; struct list_head *head; @@ -3019,15 +3782,20 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, * as we know for sure that this will not trigger any of the * checks and call back into the fixup functions where we * might deadlock. + * + * BH and threaded workqueues need separate lockdep keys to avoid + * spuriously triggering "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} + * usage". */ - INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); + INIT_WORK_ONSTACK_KEY(&barr->work, wq_barrier_func, + (pwq->wq->flags & WQ_BH) ? &bh_key : &thr_key); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion_map(&barr->done, &target->lockdep_map); barr->task = current; - /* The barrier work item does not participate in pwq->nr_active. */ + /* The barrier work item does not participate in nr_active. */ work_flags |= WORK_STRUCT_INACTIVE; /* @@ -3124,6 +3892,35 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, return wait; } +static void touch_wq_lockdep_map(struct workqueue_struct *wq) +{ +#ifdef CONFIG_LOCKDEP + if (wq->flags & WQ_BH) + local_bh_disable(); + + lock_map_acquire(&wq->lockdep_map); + lock_map_release(&wq->lockdep_map); + + if (wq->flags & WQ_BH) + local_bh_enable(); +#endif +} + +static void touch_work_lockdep_map(struct work_struct *work, + struct workqueue_struct *wq) +{ +#ifdef CONFIG_LOCKDEP + if (wq->flags & WQ_BH) + local_bh_disable(); + + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); + + if (wq->flags & WQ_BH) + local_bh_enable(); +#endif +} + /** * __flush_workqueue - ensure that any scheduled work has run to completion. * @wq: workqueue to flush @@ -3143,8 +3940,7 @@ void __flush_workqueue(struct workqueue_struct *wq) if (WARN_ON(!wq_online)) return; - lock_map_acquire(&wq->lockdep_map); - lock_map_release(&wq->lockdep_map); + touch_wq_lockdep_map(wq); mutex_lock(&wq->mutex); @@ -3316,7 +4112,7 @@ reflush: bool drained; raw_spin_lock_irq(&pwq->pool->lock); - drained = !pwq->nr_active && list_empty(&pwq->inactive_works); + drained = pwq_is_empty(pwq); raw_spin_unlock_irq(&pwq->pool->lock); if (drained) @@ -3343,6 +4139,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, struct worker *worker = NULL; struct worker_pool *pool; struct pool_workqueue *pwq; + struct workqueue_struct *wq; might_sleep(); @@ -3366,11 +4163,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, pwq = worker->current_pwq; } - check_flush_dependency(pwq->wq, work); + wq = pwq->wq; + check_flush_dependency(wq, work); insert_wq_barrier(pwq, barr, work, worker); raw_spin_unlock_irq(&pool->lock); + touch_work_lockdep_map(work, wq); + /* * Force a lock recursion deadlock when using flush_work() inside a * single-threaded or rescuer equipped workqueue. @@ -3380,11 +4180,9 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, * workqueues the deadlock happens when the rescuer stalls, blocking * forward progress. */ - if (!from_cancel && - (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) { - lock_map_acquire(&pwq->wq->lockdep_map); - lock_map_release(&pwq->wq->lockdep_map); - } + if (!from_cancel && (wq->saved_max_active == 1 || wq->rescuer)) + touch_wq_lockdep_map(wq); + rcu_read_unlock(); return true; already_gone: @@ -3403,9 +4201,6 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) if (WARN_ON(!work->func)) return false; - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - if (start_flush_work(work, &barr, from_cancel)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); @@ -3432,108 +4227,6 @@ bool flush_work(struct work_struct *work) } EXPORT_SYMBOL_GPL(flush_work); -struct cwt_wait { - wait_queue_entry_t wait; - struct work_struct *work; -}; - -static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) -{ - struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); - - if (cwait->work != key) - return 0; - return autoremove_wake_function(wait, mode, sync, key); -} - -static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) -{ - static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq); - unsigned long flags; - int ret; - - do { - ret = try_to_grab_pending(work, is_dwork, &flags); - /* - * If someone else is already canceling, wait for it to - * finish. flush_work() doesn't work for PREEMPT_NONE - * because we may get scheduled between @work's completion - * and the other canceling task resuming and clearing - * CANCELING - flush_work() will return false immediately - * as @work is no longer busy, try_to_grab_pending() will - * return -ENOENT as @work is still being canceled and the - * other canceling task won't be able to clear CANCELING as - * we're hogging the CPU. - * - * Let's wait for completion using a waitqueue. As this - * may lead to the thundering herd problem, use a custom - * wake function which matches @work along with exclusive - * wait and wakeup. - */ - if (unlikely(ret == -ENOENT)) { - struct cwt_wait cwait; - - init_wait(&cwait.wait); - cwait.wait.func = cwt_wakefn; - cwait.work = work; - - prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait, - TASK_UNINTERRUPTIBLE); - if (work_is_canceling(work)) - schedule(); - finish_wait(&cancel_waitq, &cwait.wait); - } - } while (unlikely(ret < 0)); - - /* tell other tasks trying to grab @work to back off */ - mark_work_canceling(work); - local_irq_restore(flags); - - /* - * This allows canceling during early boot. We know that @work - * isn't executing. - */ - if (wq_online) - __flush_work(work, true); - - clear_work_data(work); - - /* - * Paired with prepare_to_wait() above so that either - * waitqueue_active() is visible here or !work_is_canceling() is - * visible there. - */ - smp_mb(); - if (waitqueue_active(&cancel_waitq)) - __wake_up(&cancel_waitq, TASK_NORMAL, 1, work); - - return ret; -} - -/** - * cancel_work_sync - cancel a work and wait for it to finish - * @work: the work to cancel - * - * Cancel @work and wait for its execution to finish. This function - * can be used even if the work re-queues itself or migrates to - * another workqueue. On return from this function, @work is - * guaranteed to be not pending or executing on any CPU. - * - * cancel_work_sync(&delayed_work->work) must not be used for - * delayed_work's. Use cancel_delayed_work_sync() instead. - * - * The caller must ensure that the workqueue on which @work was last - * queued can't be destroyed before this function returns. - * - * Return: - * %true if @work was pending, %false otherwise. - */ -bool cancel_work_sync(struct work_struct *work) -{ - return __cancel_work_timer(work, false); -} -EXPORT_SYMBOL_GPL(cancel_work_sync); - /** * flush_delayed_work - wait for a dwork to finish executing the last queueing * @dwork: the delayed work to flush @@ -3576,20 +4269,50 @@ bool flush_rcu_work(struct rcu_work *rwork) } EXPORT_SYMBOL(flush_rcu_work); -static bool __cancel_work(struct work_struct *work, bool is_dwork) +static bool __cancel_work(struct work_struct *work, u32 cflags) { - unsigned long flags; + unsigned long irq_flags; int ret; do { - ret = try_to_grab_pending(work, is_dwork, &flags); + ret = try_to_grab_pending(work, cflags, &irq_flags); } while (unlikely(ret == -EAGAIN)); if (unlikely(ret < 0)) return false; - set_work_pool_and_clear_pending(work, get_work_pool_id(work)); - local_irq_restore(flags); + set_work_pool_and_clear_pending(work, get_work_pool_id(work), 0); + local_irq_restore(irq_flags); + return ret; +} + +static bool __cancel_work_sync(struct work_struct *work, u32 cflags) +{ + unsigned long irq_flags; + bool ret; + + /* claim @work and tell other tasks trying to grab @work to back off */ + ret = work_grab_pending(work, cflags, &irq_flags); + mark_work_canceling(work); + local_irq_restore(irq_flags); + + /* + * Skip __flush_work() during early boot when we know that @work isn't + * executing. This allows canceling during early boot. + */ + if (wq_online) + __flush_work(work, true); + + /* + * smp_mb() at the end of set_work_pool_and_clear_pending() is paired + * with prepare_to_wait() above so that either waitqueue_active() is + * visible here or !work_is_canceling() is visible there. + */ + set_work_pool_and_clear_pending(work, WORK_OFFQ_POOL_NONE, 0); + + if (waitqueue_active(&wq_cancel_waitq)) + __wake_up(&wq_cancel_waitq, TASK_NORMAL, 1, work); + return ret; } @@ -3598,11 +4321,35 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork) */ bool cancel_work(struct work_struct *work) { - return __cancel_work(work, false); + return __cancel_work(work, 0); } EXPORT_SYMBOL(cancel_work); /** + * cancel_work_sync - cancel a work and wait for it to finish + * @work: the work to cancel + * + * Cancel @work and wait for its execution to finish. This function + * can be used even if the work re-queues itself or migrates to + * another workqueue. On return from this function, @work is + * guaranteed to be not pending or executing on any CPU. + * + * cancel_work_sync(&delayed_work->work) must not be used for + * delayed_work's. Use cancel_delayed_work_sync() instead. + * + * The caller must ensure that the workqueue on which @work was last + * queued can't be destroyed before this function returns. + * + * Return: + * %true if @work was pending, %false otherwise. + */ +bool cancel_work_sync(struct work_struct *work) +{ + return __cancel_work_sync(work, 0); +} +EXPORT_SYMBOL_GPL(cancel_work_sync); + +/** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel * @@ -3620,7 +4367,7 @@ EXPORT_SYMBOL(cancel_work); */ bool cancel_delayed_work(struct delayed_work *dwork) { - return __cancel_work(&dwork->work, true); + return __cancel_work(&dwork->work, WORK_CANCEL_DELAYED); } EXPORT_SYMBOL(cancel_delayed_work); @@ -3635,7 +4382,7 @@ EXPORT_SYMBOL(cancel_delayed_work); */ bool cancel_delayed_work_sync(struct delayed_work *dwork) { - return __cancel_work_timer(&dwork->work, true); + return __cancel_work_sync(&dwork->work, WORK_CANCEL_DELAYED); } EXPORT_SYMBOL(cancel_delayed_work_sync); @@ -3927,11 +4674,66 @@ static void wq_free_lockdep(struct workqueue_struct *wq) } #endif +static void free_node_nr_active(struct wq_node_nr_active **nna_ar) +{ + int node; + + for_each_node(node) { + kfree(nna_ar[node]); + nna_ar[node] = NULL; + } + + kfree(nna_ar[nr_node_ids]); + nna_ar[nr_node_ids] = NULL; +} + +static void init_node_nr_active(struct wq_node_nr_active *nna) +{ + nna->max = WQ_DFL_MIN_ACTIVE; + atomic_set(&nna->nr, 0); + raw_spin_lock_init(&nna->lock); + INIT_LIST_HEAD(&nna->pending_pwqs); +} + +/* + * Each node's nr_active counter will be accessed mostly from its own node and + * should be allocated in the node. + */ +static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar) +{ + struct wq_node_nr_active *nna; + int node; + + for_each_node(node) { + nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node); + if (!nna) + goto err_free; + init_node_nr_active(nna); + nna_ar[node] = nna; + } + + /* [nr_node_ids] is used as the fallback */ + nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE); + if (!nna) + goto err_free; + init_node_nr_active(nna); + nna_ar[nr_node_ids] = nna; + + return 0; + +err_free: + free_node_nr_active(nna_ar); + return -ENOMEM; +} + static void rcu_free_wq(struct rcu_head *rcu) { struct workqueue_struct *wq = container_of(rcu, struct workqueue_struct, rcu); + if (wq->flags & WQ_UNBOUND) + free_node_nr_active(wq->node_nr_active); + wq_free_lockdep(wq); free_percpu(wq->cpu_pwq); free_workqueue_attrs(wq->unbound_attrs); @@ -4121,6 +4923,13 @@ static void pwq_release_workfn(struct kthread_work *work) mutex_lock(&wq->mutex); list_del_rcu(&pwq->pwqs_node); is_last = list_empty(&wq->pwqs); + + /* + * For ordered workqueue with a plugged dfl_pwq, restart it now. + */ + if (!is_last && (wq->flags & __WQ_ORDERED)) + unplug_oldest_pwq(wq); + mutex_unlock(&wq->mutex); } @@ -4130,6 +4939,15 @@ static void pwq_release_workfn(struct kthread_work *work) mutex_unlock(&wq_pool_mutex); } + if (!list_empty(&pwq->pending_node)) { + struct wq_node_nr_active *nna = + wq_node_nr_active(pwq->wq, pwq->pool->node); + + raw_spin_lock_irq(&nna->lock); + list_del_init(&pwq->pending_node); + raw_spin_unlock_irq(&nna->lock); + } + call_rcu(&pwq->rcu, rcu_free_pwq); /* @@ -4142,55 +4960,11 @@ static void pwq_release_workfn(struct kthread_work *work) } } -/** - * pwq_adjust_max_active - update a pwq's max_active to the current setting - * @pwq: target pool_workqueue - * - * If @pwq isn't freezing, set @pwq->max_active to the associated - * workqueue's saved_max_active and activate inactive work items - * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. - */ -static void pwq_adjust_max_active(struct pool_workqueue *pwq) -{ - struct workqueue_struct *wq = pwq->wq; - bool freezable = wq->flags & WQ_FREEZABLE; - unsigned long flags; - - /* for @wq->saved_max_active */ - lockdep_assert_held(&wq->mutex); - - /* fast exit for non-freezable wqs */ - if (!freezable && pwq->max_active == wq->saved_max_active) - return; - - /* this function can be called during early boot w/ irq disabled */ - raw_spin_lock_irqsave(&pwq->pool->lock, flags); - - /* - * During [un]freezing, the caller is responsible for ensuring that - * this function is called at least once after @workqueue_freezing - * is updated and visible. - */ - if (!freezable || !workqueue_freezing) { - pwq->max_active = wq->saved_max_active; - - while (!list_empty(&pwq->inactive_works) && - pwq->nr_active < pwq->max_active) - pwq_activate_first_inactive(pwq); - - kick_pool(pwq->pool); - } else { - pwq->max_active = 0; - } - - raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); -} - /* initialize newly allocated @pwq which is associated with @wq and @pool */ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) { - BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); + BUG_ON((unsigned long)pwq & ~WORK_STRUCT_PWQ_MASK); memset(pwq, 0, sizeof(*pwq)); @@ -4199,6 +4973,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, pwq->flush_color = -1; pwq->refcnt = 1; INIT_LIST_HEAD(&pwq->inactive_works); + INIT_LIST_HEAD(&pwq->pending_node); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); kthread_init_work(&pwq->release_work, pwq_release_workfn); @@ -4218,11 +4993,8 @@ static void link_pwq(struct pool_workqueue *pwq) /* set the matching work_color */ pwq->work_color = wq->work_color; - /* sync max_active to the current setting */ - pwq_adjust_max_active(pwq); - /* link in @pwq */ - list_add_rcu(&pwq->pwqs_node, &wq->pwqs); + list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); } /* obtain a pool matching @attr and create a pwq associating the pool and @wq */ @@ -4289,10 +5061,11 @@ static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu, "possible intersect\n"); } -/* install @pwq into @wq's cpu_pwq and return the old pwq */ +/* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */ static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq, int cpu, struct pool_workqueue *pwq) { + struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu); struct pool_workqueue *old_pwq; lockdep_assert_held(&wq_pool_mutex); @@ -4301,8 +5074,8 @@ static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq, /* link_pwq() can handle duplicate calls */ link_pwq(pwq); - old_pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); - rcu_assign_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu), pwq); + old_pwq = rcu_access_pointer(*slot); + rcu_assign_pointer(*slot, pwq); return old_pwq; } @@ -4383,6 +5156,15 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->attrs = new_attrs; + /* + * For initialized ordered workqueues, there should only be one pwq + * (dfl_pwq). Set the plugged flag of ctx->dfl_pwq to suspend execution + * of newly queued work items until execution of older work items in + * the old pwq's have completed. + */ + if ((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)) + ctx->dfl_pwq->plugged = true; + ctx->wq = wq; return ctx; @@ -4402,14 +5184,19 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); - /* save the previous pwq and install the new one */ + /* save the previous pwqs and install the new ones */ for_each_possible_cpu(cpu) ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu, ctx->pwq_tbl[cpu]); + ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq); + + /* update node_nr_active->max */ + wq_update_node_max_active(ctx->wq, -1); - /* @dfl_pwq might not have been used, ensure it's linked */ - link_pwq(ctx->dfl_pwq); - swap(ctx->wq->dfl_pwq, ctx->dfl_pwq); + /* rescuer needs to respect wq cpumask changes */ + if (ctx->wq->rescuer) + set_cpus_allowed_ptr(ctx->wq->rescuer->task, + unbound_effective_cpumask(ctx->wq)); mutex_unlock(&ctx->wq->mutex); } @@ -4423,14 +5210,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, if (WARN_ON(!(wq->flags & WQ_UNBOUND))) return -EINVAL; - /* creating multiple pwqs breaks ordering guarantee */ - if (!list_empty(&wq->pwqs)) { - if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) - return -EINVAL; - - wq->flags &= ~__WQ_ORDERED; - } - ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask); if (IS_ERR(ctx)) return PTR_ERR(ctx); @@ -4519,9 +5298,7 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu, /* nothing to do if the target cpumask matches the current pwq */ wq_calc_pod_cpumask(target_attrs, cpu, off_cpu); - pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu), - lockdep_is_held(&wq_pool_mutex)); - if (wqattrs_equal(target_attrs, pwq->pool->attrs)) + if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs)) return; /* create a new pwq */ @@ -4539,10 +5316,11 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu, use_dfl_pwq: mutex_lock(&wq->mutex); - raw_spin_lock_irq(&wq->dfl_pwq->pool->lock); - get_pwq(wq->dfl_pwq); - raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock); - old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq); + pwq = unbound_pwq(wq, -1); + raw_spin_lock_irq(&pwq->pool->lock); + get_pwq(pwq); + raw_spin_unlock_irq(&pwq->pool->lock); + old_pwq = install_unbound_pwq(wq, cpu, pwq); out_unlock: mutex_unlock(&wq->mutex); put_pwq_unlocked(old_pwq); @@ -4559,10 +5337,17 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) if (!(wq->flags & WQ_UNBOUND)) { for_each_possible_cpu(cpu) { - struct pool_workqueue **pwq_p = - per_cpu_ptr(wq->cpu_pwq, cpu); - struct worker_pool *pool = - &(per_cpu_ptr(cpu_worker_pools, cpu)[highpri]); + struct pool_workqueue **pwq_p; + struct worker_pool __percpu *pools; + struct worker_pool *pool; + + if (wq->flags & WQ_BH) + pools = bh_worker_pools; + else + pools = cpu_worker_pools; + + pool = &(per_cpu_ptr(pools, cpu)[highpri]); + pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu); *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); @@ -4580,10 +5365,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) cpus_read_lock(); if (wq->flags & __WQ_ORDERED) { + struct pool_workqueue *dfl_pwq; + ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); /* there should only be single pwq for ordering guarantee */ - WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || - wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), + dfl_pwq = rcu_access_pointer(wq->dfl_pwq); + WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node || + wq->pwqs.prev != &dfl_pwq->pwqs_node), "ordering guarantee broken for workqueue %s\n", wq->name); } else { ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); @@ -4652,12 +5440,78 @@ static int init_rescuer(struct workqueue_struct *wq) } wq->rescuer = rescuer; - kthread_bind_mask(rescuer->task, cpu_possible_mask); + if (wq->flags & WQ_UNBOUND) + kthread_bind_mask(rescuer->task, wq_unbound_cpumask); + else + kthread_bind_mask(rescuer->task, cpu_possible_mask); wake_up_process(rescuer->task); return 0; } +/** + * wq_adjust_max_active - update a wq's max_active to the current setting + * @wq: target workqueue + * + * If @wq isn't freezing, set @wq->max_active to the saved_max_active and + * activate inactive work items accordingly. If @wq is freezing, clear + * @wq->max_active to zero. + */ +static void wq_adjust_max_active(struct workqueue_struct *wq) +{ + bool activated; + int new_max, new_min; + + lockdep_assert_held(&wq->mutex); + + if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) { + new_max = 0; + new_min = 0; + } else { + new_max = wq->saved_max_active; + new_min = wq->saved_min_active; + } + + if (wq->max_active == new_max && wq->min_active == new_min) + return; + + /* + * Update @wq->max/min_active and then kick inactive work items if more + * active work items are allowed. This doesn't break work item ordering + * because new work items are always queued behind existing inactive + * work items if there are any. + */ + WRITE_ONCE(wq->max_active, new_max); + WRITE_ONCE(wq->min_active, new_min); + + if (wq->flags & WQ_UNBOUND) + wq_update_node_max_active(wq, -1); + + if (new_max == 0) + return; + + /* + * Round-robin through pwq's activating the first inactive work item + * until max_active is filled. + */ + do { + struct pool_workqueue *pwq; + + activated = false; + for_each_pwq(pwq, wq) { + unsigned long irq_flags; + + /* can be called during early boot w/ irq disabled */ + raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags); + if (pwq_activate_first_inactive(pwq, true)) { + activated = true; + kick_pool(pwq->pool); + } + raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags); + } + } while (activated); +} + __printf(1, 4) struct workqueue_struct *alloc_workqueue(const char *fmt, unsigned int flags, @@ -4665,23 +5519,27 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, { va_list args; struct workqueue_struct *wq; - struct pool_workqueue *pwq; + size_t wq_size; + int name_len; - /* - * Unbound && max_active == 1 used to imply ordered, which is no longer - * the case on many machines due to per-pod pools. While - * alloc_ordered_workqueue() is the right way to create an ordered - * workqueue, keep the previous behavior to avoid subtle breakages. - */ - if ((flags & WQ_UNBOUND) && max_active == 1) - flags |= __WQ_ORDERED; + if (flags & WQ_BH) { + if (WARN_ON_ONCE(flags & ~__WQ_BH_ALLOWS)) + return NULL; + if (WARN_ON_ONCE(max_active)) + return NULL; + } /* see the comment above the definition of WQ_POWER_EFFICIENT */ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) flags |= WQ_UNBOUND; /* allocate wq and format name */ - wq = kzalloc(sizeof(*wq), GFP_KERNEL); + if (flags & WQ_UNBOUND) + wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1); + else + wq_size = sizeof(*wq); + + wq = kzalloc(wq_size, GFP_KERNEL); if (!wq) return NULL; @@ -4692,15 +5550,30 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, } va_start(args, max_active); - vsnprintf(wq->name, sizeof(wq->name), fmt, args); + name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args); va_end(args); - max_active = max_active ?: WQ_DFL_ACTIVE; - max_active = wq_clamp_max_active(max_active, flags, wq->name); + if (name_len >= WQ_NAME_LEN) + pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", + wq->name); + + if (flags & WQ_BH) { + /* + * BH workqueues always share a single execution context per CPU + * and don't impose any max_active limit. + */ + max_active = INT_MAX; + } else { + max_active = max_active ?: WQ_DFL_ACTIVE; + max_active = wq_clamp_max_active(max_active, flags, wq->name); + } /* init wq */ wq->flags = flags; - wq->saved_max_active = max_active; + wq->max_active = max_active; + wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE); + wq->saved_max_active = wq->max_active; + wq->saved_min_active = wq->min_active; mutex_init(&wq->mutex); atomic_set(&wq->nr_pwqs_to_flush, 0); INIT_LIST_HEAD(&wq->pwqs); @@ -4711,8 +5584,13 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, wq_init_lockdep(wq); INIT_LIST_HEAD(&wq->list); + if (flags & WQ_UNBOUND) { + if (alloc_node_nr_active(wq->node_nr_active) < 0) + goto err_unreg_lockdep; + } + if (alloc_and_link_pwqs(wq) < 0) - goto err_unreg_lockdep; + goto err_free_node_nr_active; if (wq_online && init_rescuer(wq) < 0) goto err_destroy; @@ -4728,8 +5606,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, mutex_lock(&wq_pool_mutex); mutex_lock(&wq->mutex); - for_each_pwq(pwq, wq) - pwq_adjust_max_active(pwq); + wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); list_add_tail_rcu(&wq->list, &workqueues); @@ -4738,6 +5615,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, return wq; +err_free_node_nr_active: + if (wq->flags & WQ_UNBOUND) + free_node_nr_active(wq->node_nr_active); err_unreg_lockdep: wq_unregister_lockdep(wq); wq_free_lockdep(wq); @@ -4759,9 +5639,9 @@ static bool pwq_busy(struct pool_workqueue *pwq) if (pwq->nr_in_flight[i]) return true; - if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1)) + if ((pwq != rcu_access_pointer(pwq->wq->dfl_pwq)) && (pwq->refcnt > 1)) return true; - if (pwq->nr_active || !list_empty(&pwq->inactive_works)) + if (!pwq_is_empty(pwq)) return true; return false; @@ -4843,13 +5723,12 @@ void destroy_workqueue(struct workqueue_struct *wq) rcu_read_lock(); for_each_possible_cpu(cpu) { - pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); - RCU_INIT_POINTER(*per_cpu_ptr(wq->cpu_pwq, cpu), NULL); - put_pwq_unlocked(pwq); + put_pwq_unlocked(unbound_pwq(wq, cpu)); + RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL); } - put_pwq_unlocked(wq->dfl_pwq); - wq->dfl_pwq = NULL; + put_pwq_unlocked(unbound_pwq(wq, -1)); + RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL); rcu_read_unlock(); } @@ -4860,34 +5739,63 @@ EXPORT_SYMBOL_GPL(destroy_workqueue); * @wq: target workqueue * @max_active: new max_active value. * - * Set max_active of @wq to @max_active. + * Set max_active of @wq to @max_active. See the alloc_workqueue() function + * comment. * * CONTEXT: * Don't call from IRQ context. */ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { - struct pool_workqueue *pwq; - + /* max_active doesn't mean anything for BH workqueues */ + if (WARN_ON(wq->flags & WQ_BH)) + return; /* disallow meddling with max_active for ordered workqueues */ - if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) + if (WARN_ON(wq->flags & __WQ_ORDERED)) return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); mutex_lock(&wq->mutex); - wq->flags &= ~__WQ_ORDERED; wq->saved_max_active = max_active; + if (wq->flags & WQ_UNBOUND) + wq->saved_min_active = min(wq->saved_min_active, max_active); - for_each_pwq(pwq, wq) - pwq_adjust_max_active(pwq); + wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(workqueue_set_max_active); /** + * workqueue_set_min_active - adjust min_active of an unbound workqueue + * @wq: target unbound workqueue + * @min_active: new min_active value + * + * Set min_active of an unbound workqueue. Unlike other types of workqueues, an + * unbound workqueue is not guaranteed to be able to process max_active + * interdependent work items. Instead, an unbound workqueue is guaranteed to be + * able to process min_active number of interdependent work items which is + * %WQ_DFL_MIN_ACTIVE by default. + * + * Use this function to adjust the min_active value between 0 and the current + * max_active. + */ +void workqueue_set_min_active(struct workqueue_struct *wq, int min_active) +{ + /* min_active is only meaningful for non-ordered unbound workqueues */ + if (WARN_ON((wq->flags & (WQ_BH | WQ_UNBOUND | __WQ_ORDERED)) != + WQ_UNBOUND)) + return; + + mutex_lock(&wq->mutex); + wq->saved_min_active = clamp(min_active, 0, wq->saved_max_active); + wq_adjust_max_active(wq); + mutex_unlock(&wq->mutex); +} + +/** * current_work - retrieve %current task's work struct * * Determine if %current task is a workqueue worker and what it's working on. @@ -4972,7 +5880,7 @@ EXPORT_SYMBOL_GPL(workqueue_congested); unsigned int work_busy(struct work_struct *work) { struct worker_pool *pool; - unsigned long flags; + unsigned long irq_flags; unsigned int ret = 0; if (work_pending(work)) @@ -4981,10 +5889,10 @@ unsigned int work_busy(struct work_struct *work) rcu_read_lock(); pool = get_work_pool(work); if (pool) { - raw_spin_lock_irqsave(&pool->lock, flags); + raw_spin_lock_irqsave(&pool->lock, irq_flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - raw_spin_unlock_irqrestore(&pool->lock, flags); + raw_spin_unlock_irqrestore(&pool->lock, irq_flags); } rcu_read_unlock(); @@ -5069,7 +5977,24 @@ static void pr_cont_pool_info(struct worker_pool *pool) pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); if (pool->node != NUMA_NO_NODE) pr_cont(" node=%d", pool->node); - pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); + pr_cont(" flags=0x%x", pool->flags); + if (pool->flags & POOL_BH) + pr_cont(" bh%s", + pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : ""); + else + pr_cont(" nice=%d", pool->attrs->nice); +} + +static void pr_cont_worker_id(struct worker *worker) +{ + struct worker_pool *pool = worker->pool; + + if (pool->flags & WQ_BH) + pr_cont("bh%s", + pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : ""); + else + pr_cont("%d%s", task_pid_nr(worker->task), + worker->rescue_wq ? "(RESCUER)" : ""); } struct pr_cont_work_struct { @@ -5128,8 +6053,8 @@ static void show_pwq(struct pool_workqueue *pwq) pr_info(" pwq %d:", pool->id); pr_cont_pool_info(pool); - pr_cont(" active=%d/%d refcnt=%d%s\n", - pwq->nr_active, pwq->max_active, pwq->refcnt, + pr_cont(" active=%d refcnt=%d%s\n", + pwq->nr_active, pwq->refcnt, !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); hash_for_each(pool->busy_hash, bkt, worker, hentry) { @@ -5146,10 +6071,9 @@ static void show_pwq(struct pool_workqueue *pwq) if (worker->current_pwq != pwq) continue; - pr_cont("%s %d%s:%ps", comma ? "," : "", - task_pid_nr(worker->task), - worker->rescue_wq ? "(RESCUER)" : "", - worker->current_func); + pr_cont(" %s", comma ? "," : ""); + pr_cont_worker_id(worker); + pr_cont(":%ps", worker->current_func); list_for_each_entry(work, &worker->scheduled, entry) pr_cont_work(false, work, &pcws); pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); @@ -5200,10 +6124,10 @@ void show_one_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; bool idle = true; - unsigned long flags; + unsigned long irq_flags; for_each_pwq(pwq, wq) { - if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { + if (!pwq_is_empty(pwq)) { idle = false; break; } @@ -5214,8 +6138,8 @@ void show_one_workqueue(struct workqueue_struct *wq) pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); for_each_pwq(pwq, wq) { - raw_spin_lock_irqsave(&pwq->pool->lock, flags); - if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { + raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags); + if (!pwq_is_empty(pwq)) { /* * Defer printing to avoid deadlocks in console * drivers that queue work while holding locks @@ -5225,7 +6149,7 @@ void show_one_workqueue(struct workqueue_struct *wq) show_pwq(pwq); printk_deferred_exit(); } - raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); + raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_all_workqueues(). Avoid triggering @@ -5244,10 +6168,10 @@ static void show_one_worker_pool(struct worker_pool *pool) { struct worker *worker; bool first = true; - unsigned long flags; + unsigned long irq_flags; unsigned long hung = 0; - raw_spin_lock_irqsave(&pool->lock, flags); + raw_spin_lock_irqsave(&pool->lock, irq_flags); if (pool->nr_workers == pool->nr_idle) goto next_pool; @@ -5268,14 +6192,14 @@ static void show_one_worker_pool(struct worker_pool *pool) pr_cont(" manager: %d", task_pid_nr(pool->manager->task)); list_for_each_entry(worker, &pool->idle_list, entry) { - pr_cont(" %s%d", first ? "idle: " : "", - task_pid_nr(worker->task)); + pr_cont(" %s", first ? "idle: " : ""); + pr_cont_worker_id(worker); first = false; } pr_cont("\n"); printk_deferred_exit(); next_pool: - raw_spin_unlock_irqrestore(&pool->lock, flags); + raw_spin_unlock_irqrestore(&pool->lock, irq_flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_all_workqueues(). Avoid triggering @@ -5542,13 +6466,15 @@ int workqueue_online_cpu(unsigned int cpu) mutex_lock(&wq_pool_mutex); for_each_pool(pool, pi) { - mutex_lock(&wq_pool_attach_mutex); + /* BH pools aren't affected by hotplug */ + if (pool->flags & POOL_BH) + continue; + mutex_lock(&wq_pool_attach_mutex); if (pool->cpu == cpu) rebind_workers(pool); else if (pool->cpu < 0) restore_unbound_workers_cpumask(pool, cpu); - mutex_unlock(&wq_pool_attach_mutex); } @@ -5562,6 +6488,10 @@ int workqueue_online_cpu(unsigned int cpu) for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) wq_update_pod(wq, tcpu, cpu, true); + + mutex_lock(&wq->mutex); + wq_update_node_max_active(wq, -1); + mutex_unlock(&wq->mutex); } } @@ -5590,6 +6520,10 @@ int workqueue_offline_cpu(unsigned int cpu) for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) wq_update_pod(wq, tcpu, cpu, false); + + mutex_lock(&wq->mutex); + wq_update_node_max_active(wq, cpu); + mutex_unlock(&wq->mutex); } } mutex_unlock(&wq_pool_mutex); @@ -5677,7 +6611,6 @@ EXPORT_SYMBOL_GPL(work_on_cpu_safe_key); void freeze_workqueues_begin(void) { struct workqueue_struct *wq; - struct pool_workqueue *pwq; mutex_lock(&wq_pool_mutex); @@ -5686,8 +6619,7 @@ void freeze_workqueues_begin(void) list_for_each_entry(wq, &workqueues, list) { mutex_lock(&wq->mutex); - for_each_pwq(pwq, wq) - pwq_adjust_max_active(pwq); + wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } @@ -5752,7 +6684,6 @@ out_unlock: void thaw_workqueues(void) { struct workqueue_struct *wq; - struct pool_workqueue *pwq; mutex_lock(&wq_pool_mutex); @@ -5764,8 +6695,7 @@ void thaw_workqueues(void) /* restore max_active and repopulate worklist */ list_for_each_entry(wq, &workqueues, list) { mutex_lock(&wq->mutex); - for_each_pwq(pwq, wq) - pwq_adjust_max_active(pwq); + wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } @@ -5784,16 +6714,9 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) lockdep_assert_held(&wq_pool_mutex); list_for_each_entry(wq, &workqueues, list) { - if (!(wq->flags & WQ_UNBOUND)) + if (!(wq->flags & WQ_UNBOUND) || (wq->flags & __WQ_DESTROYING)) continue; - /* creating multiple pwqs breaks ordering guarantee */ - if (!list_empty(&wq->pwqs)) { - if (wq->flags & __WQ_ORDERED_EXPLICIT) - continue; - wq->flags &= ~__WQ_ORDERED; - } - ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); @@ -6307,11 +7230,10 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) int ret; /* - * Adjusting max_active or creating new pwqs by applying - * attributes breaks ordering guarantee. Disallow exposing ordered - * workqueues. + * Adjusting max_active breaks ordering guarantee. Disallow exposing + * ordered workqueues. */ - if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) + if (WARN_ON(wq->flags & __WQ_ORDERED)) return -EINVAL; wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); @@ -6408,10 +7330,10 @@ static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; static void show_cpu_pool_hog(struct worker_pool *pool) { struct worker *worker; - unsigned long flags; + unsigned long irq_flags; int bkt; - raw_spin_lock_irqsave(&pool->lock, flags); + raw_spin_lock_irqsave(&pool->lock, irq_flags); hash_for_each(pool->busy_hash, bkt, worker, hentry) { if (task_is_running(worker->task)) { @@ -6429,7 +7351,7 @@ static void show_cpu_pool_hog(struct worker_pool *pool) } } - raw_spin_unlock_irqrestore(&pool->lock, flags); + raw_spin_unlock_irqrestore(&pool->lock, irq_flags); } static void show_cpu_pools_hogs(void) @@ -6501,7 +7423,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) /* did we stall? */ if (time_after(now, ts + thresh)) { lockup_detected = true; - if (pool->cpu >= 0) { + if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) { pool->cpu_stall = true; cpu_pool_stall = true; } @@ -6584,6 +7506,16 @@ static inline void wq_watchdog_init(void) { } #endif /* CONFIG_WQ_WATCHDOG */ +static void bh_pool_kick_normal(struct irq_work *irq_work) +{ + raise_softirq_irqoff(TASKLET_SOFTIRQ); +} + +static void bh_pool_kick_highpri(struct irq_work *irq_work) +{ + raise_softirq_irqoff(HI_SOFTIRQ); +} + static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask) { if (!cpumask_intersects(wq_unbound_cpumask, mask)) { @@ -6595,6 +7527,22 @@ static void __init restrict_unbound_cpumask(const char *name, const struct cpuma cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask); } +static void __init init_cpu_worker_pool(struct worker_pool *pool, int cpu, int nice) +{ + BUG_ON(init_worker_pool(pool)); + pool->cpu = cpu; + cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); + cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu)); + pool->attrs->nice = nice; + pool->attrs->affn_strict = true; + pool->node = cpu_to_node(cpu); + + /* alloc pool ID */ + mutex_lock(&wq_pool_mutex); + BUG_ON(worker_pool_assign_id(pool)); + mutex_unlock(&wq_pool_mutex); +} + /** * workqueue_init_early - early init for workqueue subsystem * @@ -6609,6 +7557,8 @@ void __init workqueue_init_early(void) { struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM]; int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; + void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal, + bh_pool_kick_highpri }; int i, cpu; BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); @@ -6630,6 +7580,13 @@ void __init workqueue_init_early(void) wq_update_pod_attrs_buf = alloc_workqueue_attrs(); BUG_ON(!wq_update_pod_attrs_buf); + /* + * If nohz_full is enabled, set power efficient workqueue as unbound. + * This allows workqueue items to be moved to HK CPUs. + */ + if (housekeeping_enabled(HK_TYPE_TICK)) + wq_power_efficient = true; + /* initialize WQ_AFFN_SYSTEM pods */ pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL); pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL); @@ -6643,25 +7600,21 @@ void __init workqueue_init_early(void) pt->pod_node[0] = NUMA_NO_NODE; pt->cpu_pod[0] = 0; - /* initialize CPU pools */ + /* initialize BH and CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; i = 0; - for_each_cpu_worker_pool(pool, cpu) { - BUG_ON(init_worker_pool(pool)); - pool->cpu = cpu; - cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); - cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu)); - pool->attrs->nice = std_nice[i++]; - pool->attrs->affn_strict = true; - pool->node = cpu_to_node(cpu); - - /* alloc pool ID */ - mutex_lock(&wq_pool_mutex); - BUG_ON(worker_pool_assign_id(pool)); - mutex_unlock(&wq_pool_mutex); + for_each_bh_worker_pool(pool, cpu) { + init_cpu_worker_pool(pool, cpu, std_nice[i]); + pool->flags |= POOL_BH; + init_irq_work(bh_pool_irq_work(pool), irq_work_fns[i]); + i++; } + + i = 0; + for_each_cpu_worker_pool(pool, cpu) + init_cpu_worker_pool(pool, cpu, std_nice[i++]); } /* create default unbound and ordered wq attrs */ @@ -6691,13 +7644,17 @@ void __init workqueue_init_early(void) WQ_FREEZABLE, 0); system_power_efficient_wq = alloc_workqueue("events_power_efficient", WQ_POWER_EFFICIENT, 0); - system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient", + system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient", WQ_FREEZABLE | WQ_POWER_EFFICIENT, 0); + system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0); + system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", + WQ_BH | WQ_HIGHPRI, 0); BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_power_efficient_wq || - !system_freezable_power_efficient_wq); + !system_freezable_power_efficient_wq || + !system_bh_wq || !system_bh_highpri_wq); } static void __init wq_cpu_intensive_thresh_init(void) @@ -6763,9 +7720,10 @@ void __init workqueue_init(void) * up. Also, create a rescuer for workqueues that requested it. */ for_each_possible_cpu(cpu) { - for_each_cpu_worker_pool(pool, cpu) { + for_each_bh_worker_pool(pool, cpu) + pool->node = cpu_to_node(cpu); + for_each_cpu_worker_pool(pool, cpu) pool->node = cpu_to_node(cpu); - } } list_for_each_entry(wq, &workqueues, list) { @@ -6776,7 +7734,16 @@ void __init workqueue_init(void) mutex_unlock(&wq_pool_mutex); - /* create the initial workers */ + /* + * Create the initial workers. A BH pool has one pseudo worker that + * represents the shared BH execution context and thus doesn't get + * affected by hotplug events. Create the BH pseudo workers for all + * possible CPUs here. + */ + for_each_possible_cpu(cpu) + for_each_bh_worker_pool(pool, cpu) + BUG_ON(!create_worker(pool)); + for_each_online_cpu(cpu) { for_each_cpu_worker_pool(pool, cpu) { pool->flags &= ~POOL_DISASSOCIATED; @@ -6856,7 +7823,7 @@ static bool __init cpus_share_numa(int cpu0, int cpu1) /** * workqueue_init_topology - initialize CPU pods for unbound workqueues * - * This is the third step of there-staged workqueue subsystem initialization and + * This is the third step of three-staged workqueue subsystem initialization and * invoked after SMP and topology information are fully initialized. It * initializes the unbound CPU pods accordingly. */ @@ -6870,6 +7837,8 @@ void __init workqueue_init_topology(void) init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache); init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa); + wq_topo_initialized = true; + mutex_lock(&wq_pool_mutex); /* @@ -6878,8 +7847,12 @@ void __init workqueue_init_topology(void) * combinations to apply per-pod sharing. */ list_for_each_entry(wq, &workqueues, list) { - for_each_online_cpu(cpu) { + for_each_online_cpu(cpu) wq_update_pod(wq, cpu, cpu, true); + if (wq->flags & WQ_UNBOUND) { + mutex_lock(&wq->mutex); + wq_update_node_max_active(wq, -1); + mutex_unlock(&wq->mutex); } } |