summary refs log tree commit diff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/bpf/btf.c 231
-rw-r--r-- kernel/bpf/cpumap.c 2
-rw-r--r-- kernel/bpf/helpers.c 5
-rw-r--r-- kernel/bpf/task_iter.c 2
-rw-r--r-- kernel/bpf/verifier.c 9
-rw-r--r-- kernel/cgroup/cgroup-v1.c 2
-rw-r--r-- kernel/cgroup/cgroup.c 35
-rw-r--r-- kernel/cgroup/cpuset.c 10
-rw-r--r-- kernel/context_tracking.c 4
-rw-r--r-- kernel/crash_core.c 16
-rw-r--r-- kernel/debug/kdb/kdb_main.c 2
-rw-r--r-- kernel/dma/debug.c 5
-rw-r--r-- kernel/dma/swiotlb.c 3
-rw-r--r-- kernel/events/uprobes.c 2
-rw-r--r-- kernel/exit.c 41
-rw-r--r-- kernel/fork.c 151
-rw-r--r-- kernel/futex/core.c 15
-rw-r--r-- kernel/futex/pi.c 11
-rw-r--r-- kernel/irq/irqdesc.c 2
-rw-r--r-- kernel/kexec_core.c 1
-rw-r--r-- kernel/kprobes.c 4
-rw-r--r-- kernel/nsproxy.c 2
-rw-r--r-- kernel/pid.c 57
-rw-r--r-- kernel/power/swap.c 28
-rw-r--r-- kernel/rcu/Kconfig 13
-rw-r--r-- kernel/rcu/rcu.h 19
-rw-r--r-- kernel/rcu/rcuscale.c 6
-rw-r--r-- kernel/rcu/rcutorture.c 13
-rw-r--r-- kernel/rcu/srcutree.c 24
-rw-r--r-- kernel/rcu/sync.c 16
-rw-r--r-- kernel/rcu/tasks.h 135
-rw-r--r-- kernel/rcu/tiny.c 1
-rw-r--r-- kernel/rcu/tree.c 271
-rw-r--r-- kernel/rcu/tree.h 20
-rw-r--r-- kernel/rcu/tree_exp.h 86
-rw-r--r-- kernel/rcu/tree_nocb.h 69
-rw-r--r-- kernel/rcu/tree_plugin.h 52
-rw-r--r-- kernel/sched/core.c 17
-rw-r--r-- kernel/sched/cpufreq_schedutil.c 6
-rw-r--r-- kernel/sched/membarrier.c 6
-rw-r--r-- kernel/signal.c 110
-rw-r--r-- kernel/sys.c 50
-rw-r--r-- kernel/time/clocksource.c 25
-rw-r--r-- kernel/time/hrtimer.c 3
-rw-r--r-- kernel/time/tick-sched.c 10
-rw-r--r-- kernel/time/time_test.c 2
-rw-r--r-- kernel/trace/fprobe.c 14
-rw-r--r-- kernel/trace/ftrace.c 10
-rw-r--r-- kernel/trace/ring_buffer.c 908
-rw-r--r-- kernel/trace/ring_buffer_benchmark.c 10
-rw-r--r-- kernel/trace/trace.c 333
-rw-r--r-- kernel/trace/trace.h 2
-rw-r--r-- kernel/trace/trace_boot.c 2
-rw-r--r-- kernel/trace/trace_btf.c 4
-rw-r--r-- kernel/trace/trace_events.c 107
-rw-r--r-- kernel/trace/trace_events_hist.c 49
-rw-r--r-- kernel/trace/trace_events_synth.c 3
-rw-r--r-- kernel/trace/trace_events_trigger.c 6
-rw-r--r-- kernel/trace/trace_kprobe.c 4
-rw-r--r-- kernel/trace/trace_osnoise.c 6
-rw-r--r-- kernel/trace/trace_output.c 6
-rw-r--r-- kernel/trace/trace_probe.c 32
-rw-r--r-- kernel/trace/trace_probe.h 3
-rw-r--r-- kernel/trace/trace_seq.c 3
-rw-r--r-- kernel/trace/tracing_map.c 7
65 files changed, 1942 insertions, 1161 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 51e8b4bee0c8..596471189176 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5615,21 +5615,46 @@ static u8 bpf_ctx_convert_map[] = {
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
-const struct btf_member *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
- const struct btf_type *t, enum bpf_prog_type prog_type,
- int arg)
+static const struct btf_type *find_canonical_prog_ctx_type(enum bpf_prog_type prog_type)
{
const struct btf_type *conv_struct;
- const struct btf_type *ctx_struct;
const struct btf_member *ctx_type;
- const char *tname, *ctx_tname;
conv_struct = bpf_ctx_convert.t;
- if (!conv_struct) {
- bpf_log(log, "btf_vmlinux is malformed\n");
+ if (!conv_struct)
return NULL;
- }
+ /* prog_type is valid bpf program type. No need for bounds check. */
+ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
+ /* ctx_type is a pointer to prog_ctx_type in vmlinux.
+ * Like 'struct __sk_buff'
+ */
+ return btf_type_by_id(btf_vmlinux, ctx_type->type);
+}
+
+static int find_kern_ctx_type_id(enum bpf_prog_type prog_type)
+{
+ const struct btf_type *conv_struct;
+ const struct btf_member *ctx_type;
+
+ conv_struct = bpf_ctx_convert.t;
+ if (!conv_struct)
+ return -EFAULT;
+ /* prog_type is valid bpf program type. No need for bounds check. */
+ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1;
+ /* ctx_type is the kernel-side counterpart of prog_ctx_type in vmlinux.
+ * Like 'struct sk_buff'
+ */
+ return ctx_type->type;
+}
+
+const struct btf_type *
+btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, enum bpf_prog_type prog_type,
+ int arg)
+{
+ const struct btf_type *ctx_type;
+ const char *tname, *ctx_tname;
+
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
@@ -5646,17 +5671,15 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
return NULL;
}
- /* prog_type is valid bpf program type. No need for bounds check. */
- ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
- /* ctx_struct is a pointer to prog_ctx_type in vmlinux.
- * Like 'struct __sk_buff'
- */
- ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type);
- if (!ctx_struct)
+
+ ctx_type = find_canonical_prog_ctx_type(prog_type);
+ if (!ctx_type) {
+ bpf_log(log, "btf_vmlinux is malformed\n");
/* should not happen */
return NULL;
+ }
again:
- ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off);
+ ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
if (!ctx_tname) {
/* should not happen */
bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n");
@@ -5677,28 +5700,167 @@ again:
/* bpf_user_pt_regs_t is a typedef, so resolve it to
* underlying struct and check name again
*/
- if (!btf_type_is_modifier(ctx_struct))
+ if (!btf_type_is_modifier(ctx_type))
return NULL;
- while (btf_type_is_modifier(ctx_struct))
- ctx_struct = btf_type_by_id(btf_vmlinux, ctx_struct->type);
+ while (btf_type_is_modifier(ctx_type))
+ ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
goto again;
}
return ctx_type;
}
+/* forward declarations for arch-specific underlying types of
+ * bpf_user_pt_regs_t; this avoids the need for arch-specific #ifdef
+ * compilation guards below for BPF_PROG_TYPE_PERF_EVENT checks, but still
+ * works correctly with __builtin_types_compatible_p() on respective
+ * architectures
+ */
+struct user_regs_struct;
+struct user_pt_regs;
+
+static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, int arg,
+ enum bpf_prog_type prog_type,
+ enum bpf_attach_type attach_type)
+{
+ const struct btf_type *ctx_type;
+ const char *tname, *ctx_tname;
+
+ if (!btf_is_ptr(t)) {
+ bpf_log(log, "arg#%d type isn't a pointer\n", arg);
+ return -EINVAL;
+ }
+ t = btf_type_by_id(btf, t->type);
+
+ /* KPROBE and PERF_EVENT programs allow bpf_user_pt_regs_t typedef */
+ if (prog_type == BPF_PROG_TYPE_KPROBE || prog_type == BPF_PROG_TYPE_PERF_EVENT) {
+ while (btf_type_is_modifier(t) && !btf_type_is_typedef(t))
+ t = btf_type_by_id(btf, t->type);
+
+ if (btf_type_is_typedef(t)) {
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0)
+ return 0;
+ }
+ }
+
+ /* all other program types don't use typedefs for context type */
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+
+ /* `void *ctx __arg_ctx` is always valid */
+ if (btf_type_is_void(t))
+ return 0;
+
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (str_is_empty(tname)) {
+ bpf_log(log, "arg#%d type doesn't have a name\n", arg);
+ return -EINVAL;
+ }
+
+ /* special cases */
+ switch (prog_type) {
+ case BPF_PROG_TYPE_KPROBE:
+ if (__btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_PERF_EVENT:
+ if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) &&
+ __btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0)
+ return 0;
+ if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) &&
+ __btf_type_is_struct(t) && strcmp(tname, "user_pt_regs") == 0)
+ return 0;
+ if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) &&
+ __btf_type_is_struct(t) && strcmp(tname, "user_regs_struct") == 0)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_TRACING:
+ switch (attach_type) {
+ case BPF_TRACE_RAW_TP:
+ /* tp_btf programs are TRACING programs, so they need a special case here */
+ if (__btf_type_is_struct(t) &&
+ strcmp(tname, "bpf_raw_tracepoint_args") == 0)
+ return 0;
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ case BPF_TRACE_ITER:
+ /* allow struct bpf_iter__xxx types only */
+ if (__btf_type_is_struct(t) &&
+ strncmp(tname, "bpf_iter__", sizeof("bpf_iter__") - 1) == 0)
+ return 0;
+ break;
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN:
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ default:
+ break;
+ }
+ break;
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_SYSCALL:
+ case BPF_PROG_TYPE_EXT:
+ return 0; /* anything goes */
+ default:
+ break;
+ }
+
+ ctx_type = find_canonical_prog_ctx_type(prog_type);
+ if (!ctx_type) {
+ /* should not happen */
+ bpf_log(log, "btf_vmlinux is malformed\n");
+ return -EINVAL;
+ }
+
+ /* resolve typedefs and check that underlying structs are matching as well */
+ while (btf_type_is_modifier(ctx_type))
+ ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
+
+ /* if program type doesn't have distinctly named struct type for
+ * context, then __arg_ctx argument can only be `void *`, which we
+ * already checked above
+ */
+ if (!__btf_type_is_struct(ctx_type)) {
+ bpf_log(log, "arg#%d should be void pointer\n", arg);
+ return -EINVAL;
+ }
+
+ ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
+ if (!__btf_type_is_struct(t) || strcmp(ctx_tname, tname) != 0) {
+ bpf_log(log, "arg#%d should be `struct %s *`\n", arg, ctx_tname);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *t,
enum bpf_prog_type prog_type,
int arg)
{
- const struct btf_member *prog_ctx_type, *kern_ctx_type;
-
- prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg);
- if (!prog_ctx_type)
+ if (!btf_get_prog_ctx_type(log, btf, t, prog_type, arg))
return -ENOENT;
- kern_ctx_type = prog_ctx_type + 1;
- return kern_ctx_type->type;
+ return find_kern_ctx_type_id(prog_type);
}
int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type)
@@ -6934,6 +7096,23 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
return -EINVAL;
}
+ for (i = 0; i < nargs; i++) {
+ const char *tag;
+
+ if (sub->args[i].arg_type != ARG_PTR_TO_CTX)
+ continue;
+
+ /* check if arg has "arg:ctx" tag */
+ t = btf_type_by_id(btf, args[i].type);
+ tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:");
+ if (IS_ERR_OR_NULL(tag) || strcmp(tag, "ctx") != 0)
+ continue;
+
+ if (btf_validate_prog_ctx_type(log, btf, t, i, prog_type,
+ prog->expected_attach_type))
+ return -EINVAL;
+ }
+
sub->arg_cnt = nargs;
sub->args_cached = true;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8a0bb80fe48a..ef82ffc90cbe 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -178,7 +178,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
void **frames, int n,
struct xdp_cpumap_stats *stats)
{
- struct xdp_rxq_info rxq;
+ struct xdp_rxq_info rxq = {};
struct xdp_buff xdp;
int i, nframes = 0;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index be72824f32b2..d19cd863d294 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1101,6 +1101,7 @@ struct bpf_hrtimer {
struct bpf_prog *prog;
void __rcu *callback_fn;
void *value;
+ struct rcu_head rcu;
};
/* the actual struct hidden inside uapi struct bpf_timer */
@@ -1332,6 +1333,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
if (in_nmi())
return -EOPNOTSUPP;
+ rcu_read_lock();
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
if (!t) {
@@ -1353,6 +1355,7 @@ out:
* if it was running.
*/
ret = ret ?: hrtimer_cancel(&t->timer);
+ rcu_read_unlock();
return ret;
}
@@ -1407,7 +1410,7 @@ out:
*/
if (this_cpu_read(hrtimer_running) != t)
hrtimer_cancel(&t->timer);
- kfree(t);
+ kfree_rcu(t, rcu);
}
BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index e5c3500443c6..ec4e97c61eef 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -978,6 +978,8 @@ __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
__alignof__(struct bpf_iter_task));
+ kit->pos = NULL;
+
switch (flags) {
case BPF_TASK_ITER_ALL_THREADS:
case BPF_TASK_ITER_ALL_PROCS:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index adbf330d364b..ddea9567f755 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5227,7 +5227,9 @@ BTF_ID(struct, prog_test_ref_kfunc)
#ifdef CONFIG_CGROUPS
BTF_ID(struct, cgroup)
#endif
+#ifdef CONFIG_BPF_JIT
BTF_ID(struct, bpf_cpumask)
+#endif
BTF_ID(struct, task_struct)
BTF_SET_END(rcu_protected_types)
@@ -12826,6 +12828,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
switch (base_type(ptr_reg->type)) {
+ case PTR_TO_FLOW_KEYS:
+ if (known)
+ break;
+ fallthrough;
case CONST_PTR_TO_MAP:
/* smin_val represents the known value */
if (known && smin_val == 0 && opcode == BPF_ADD)
@@ -16596,6 +16602,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
{
int i;
+ if (old->callback_depth > cur->callback_depth)
+ return false;
+
for (i = 0; i < MAX_BPF_REG; i++)
if (!regsafe(env, &old->regs[i], &cur->regs[i],
&env->idmap_scratch, exact))
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 04d11a7dd95f..520a11cb12f4 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -802,7 +802,7 @@ void cgroup1_release_agent(struct work_struct *work)
goto out_free;
ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
- if (ret < 0 || ret >= PATH_MAX)
+ if (ret < 0)
goto out_free;
argv[0] = agentbuf;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8f3cef1a4d8a..a66c088c851c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1906,7 +1906,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
spin_unlock_irq(&css_set_lock);
- if (len >= PATH_MAX)
+ if (len == -E2BIG)
len = -ERANGE;
else if (len > 0) {
seq_escape(sf, buf, " \t\n\\");
@@ -4182,20 +4182,6 @@ static struct kernfs_ops cgroup_kf_ops = {
.seq_show = cgroup_seqfile_show,
};
-/* set uid and gid of cgroup dirs and files to that of the creator */
-static int cgroup_kn_set_ugid(struct kernfs_node *kn)
-{
- struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
- .ia_uid = current_fsuid(),
- .ia_gid = current_fsgid(), };
-
- if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
- gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
- return 0;
-
- return kernfs_setattr(kn, &iattr);
-}
-
static void cgroup_file_notify_timer(struct timer_list *timer)
{
cgroup_file_notify(container_of(timer, struct cgroup_file,
@@ -4208,25 +4194,18 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
char name[CGROUP_FILE_NAME_MAX];
struct kernfs_node *kn;
struct lock_class_key *key = NULL;
- int ret;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
key = &cft->lockdep_key;
#endif
kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
cgroup_file_mode(cft),
- GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+ current_fsuid(), current_fsgid(),
0, cft->kf_ops, cft,
NULL, key);
if (IS_ERR(kn))
return PTR_ERR(kn);
- ret = cgroup_kn_set_ugid(kn);
- if (ret) {
- kernfs_remove(kn);
- return ret;
- }
-
if (cft->file_offset) {
struct cgroup_file *cfile = (void *)css + cft->file_offset;
@@ -5629,7 +5608,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
goto out_cancel_ref;
/* create the directory */
- kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ kn = kernfs_create_dir_ns(parent->kn, name, mode,
+ current_fsuid(), current_fsgid(),
+ cgrp, NULL);
if (IS_ERR(kn)) {
ret = PTR_ERR(kn);
goto out_stat_exit;
@@ -5774,10 +5755,6 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
*/
kernfs_get(cgrp->kn);
- ret = cgroup_kn_set_ugid(cgrp->kn);
- if (ret)
- goto out_destroy;
-
ret = css_populate_dir(&cgrp->self);
if (ret)
goto out_destroy;
@@ -6316,7 +6293,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
- if (retval >= PATH_MAX)
+ if (retval == -E2BIG)
retval = -ENAMETOOLONG;
if (retval < 0)
goto out_unlock;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index dfbb16aca9f4..927bef3a598a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2562,7 +2562,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
update_partition_sd_lb(cs, old_prs);
out_free:
free_cpumasks(NULL, &tmp);
- return 0;
+ return retval;
}
/**
@@ -2598,9 +2598,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
return 0;
- if (alloc_cpumasks(NULL, &tmp))
- return -ENOMEM;
-
if (*buf)
compute_effective_exclusive_cpumask(trialcs, NULL);
@@ -2615,6 +2612,9 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval)
return retval;
+ if (alloc_cpumasks(NULL, &tmp))
+ return -ENOMEM;
+
if (old_prs) {
if (cpumask_empty(trialcs->effective_xcpus)) {
invalidate = true;
@@ -5092,7 +5092,7 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
css_put(css);
- if (retval >= PATH_MAX)
+ if (retval == -E2BIG)
retval = -ENAMETOOLONG;
if (retval < 0)
goto out_free;
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 6ef0b35fc28c..70ae70d03823 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -458,6 +458,8 @@ static __always_inline void context_tracking_recursion_exit(void)
* __ct_user_enter - Inform the context tracking that the CPU is going
* to enter user or guest space mode.
*
+ * @state: userspace context-tracking state to enter.
+ *
* This function must be called right before we switch from the kernel
* to user or guest space, when it's guaranteed the remaining kernel
* instructions to execute won't use any RCU read side critical section
@@ -595,6 +597,8 @@ NOKPROBE_SYMBOL(user_enter_callable);
* __ct_user_exit - Inform the context tracking that the CPU is
* exiting user or guest mode and entering the kernel.
*
+ * @state: userspace context-tracking state being exited from.
+ *
* This function must be called after we entered the kernel from user or
* guest space before any use of RCU read side critical section. This
* potentially include any high level kernel code like syscalls, exceptions,
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index d48315667752..75cd6a736d03 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -376,7 +376,6 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
crashk_low_res.start = low_base;
crashk_low_res.end = low_base + low_size - 1;
- insert_resource(&iomem_resource, &crashk_low_res);
#endif
return 0;
}
@@ -458,8 +457,19 @@ retry:
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
- insert_resource(&iomem_resource, &crashk_res);
}
+
+static __init int insert_crashkernel_resources(void)
+{
+ if (crashk_res.start < crashk_res.end)
+ insert_resource(&iomem_resource, &crashk_res);
+
+ if (crashk_low_res.start < crashk_low_res.end)
+ insert_resource(&iomem_resource, &crashk_low_res);
+
+ return 0;
+}
+early_initcall(insert_crashkernel_resources);
#endif
int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
@@ -867,7 +877,7 @@ subsys_initcall(crash_notes_memory_init);
* regions are online. So mutex lock __crash_hotplug_lock is used to
* serialize the crash hotplug handling specifically.
*/
-DEFINE_MUTEX(__crash_hotplug_lock);
+static DEFINE_MUTEX(__crash_hotplug_lock);
#define crash_hotplug_lock() mutex_lock(&__crash_hotplug_lock)
#define crash_hotplug_unlock() mutex_unlock(&__crash_hotplug_lock)
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 6b213c8252d6..d05066cb40b2 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1348,8 +1348,6 @@ do_full_getstr:
/* PROMPT can only be set if we have MEM_READ permission. */
snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
raw_smp_processor_id());
- if (defcmd_in_progress)
- strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
/*
* Fetch command from keyboard
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 3de494375b7b..a6e3792b15f8 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -62,7 +62,8 @@ enum map_err_types {
* @pfn: page frame of the start address
* @offset: offset of mapping relative to pfn
* @map_err_type: track whether dma_mapping_error() was checked
- * @stacktrace: support backtraces when a violation is detected
+ * @stack_len: number of backtrace entries in @stack_entries
+ * @stack_entries: stack of backtrace history
*/
struct dma_debug_entry {
struct list_head list;
@@ -876,7 +877,7 @@ static int dma_debug_device_change(struct notifier_block *nb, unsigned long acti
return 0;
}
-void dma_debug_add_bus(struct bus_type *bus)
+void dma_debug_add_bus(const struct bus_type *bus)
{
struct notifier_block *nb;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 97c298b210bc..b079a9a8e087 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1136,6 +1136,9 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
int cpu, i;
int index;
+ if (alloc_size > IO_TLB_SEGSIZE * IO_TLB_SIZE)
+ return -1;
+
cpu = raw_smp_processor_id();
for (i = 0; i < default_nareas; ++i) {
index = swiotlb_search_area(dev, cpu, i, orig_addr, alloc_size,
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 485bb0389b48..929e98c62965 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -537,7 +537,7 @@ retry:
}
}
- ret = __replace_page(vma, vaddr, old_page, new_page);
+ ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
if (new_page)
put_page(new_page);
put_old:
diff --git a/kernel/exit.c b/kernel/exit.c
index 3988a02efaef..41a12630cbbc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
tsk->exit_state = EXIT_ZOMBIE;
+ /*
+ * sub-thread or delay_group_leader(), wake up the
+ * PIDFD_THREAD waiters.
+ */
+ if (!thread_group_empty(tsk))
+ do_notify_pidfd(tsk);
+
if (unlikely(tsk->ptrace)) {
int sig = thread_group_leader(tsk) &&
thread_group_empty(tsk) &&
@@ -1127,17 +1134,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* and nobody can change them.
*
* psig->stats_lock also protects us from our sub-threads
- * which can reap other children at the same time. Until
- * we change k_getrusage()-like users to rely on this lock
- * we have to take ->siglock as well.
+ * which can reap other children at the same time.
*
* We use thread_group_cputime_adjusted() to get times for
* the thread group, which consolidates times for all threads
* in the group including the group leader.
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- spin_lock_irq(&current->sighand->siglock);
- write_seqlock(&psig->stats_lock);
+ write_seqlock_irq(&psig->stats_lock);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1160,8 +1164,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
- write_sequnlock(&psig->stats_lock);
- spin_unlock_irq(&current->sighand->siglock);
+ write_sequnlock_irq(&psig->stats_lock);
}
if (wo->wo_rusage)
@@ -1893,30 +1896,6 @@ Efault:
}
#endif
-/**
- * thread_group_exited - check that a thread group has exited
- * @pid: tgid of thread group to be checked.
- *
- * Test if the thread group represented by tgid has exited (all
- * threads are zombies, dead or completely gone).
- *
- * Return: true if the thread group has exited. false otherwise.
- */
-bool thread_group_exited(struct pid *pid)
-{
- struct task_struct *task;
- bool exited;
-
- rcu_read_lock();
- task = pid_task(pid, PIDTYPE_PID);
- exited = !task ||
- (READ_ONCE(task->exit_state) && thread_group_empty(task));
- rcu_read_unlock();
-
- return exited;
-}
-EXPORT_SYMBOL(thread_group_exited);
-
/*
* This needs to be __function_aligned as GCC implicitly makes any
* implementation of abort() cold and drops alignment specified by
diff --git a/kernel/fork.c b/kernel/fork.c
index c981fa6171c1..39a5046c2f0b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -101,6 +101,8 @@
#include <linux/user_events.h>
#include <linux/iommu.h>
#include <linux/rseq.h>
+#include <uapi/linux/pidfd.h>
+#include <linux/pidfs.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -1173,7 +1175,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->use_memdelay = 0;
#endif
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
tsk->pasid_activated = 0;
#endif
@@ -1748,6 +1750,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
if (clone_flags & CLONE_FS) {
/* tsk->fs is already what we want */
spin_lock(&fs->lock);
+ /* "users" and "in_exec" locked for check_unsafe_exec() */
if (fs->in_exec) {
spin_unlock(&fs->lock);
return -EAGAIN;
@@ -1975,6 +1978,7 @@ static inline void rcu_copy_process(struct task_struct *p)
p->rcu_tasks_holdout = false;
INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
p->rcu_tasks_idle_cpu = -1;
+ INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
p->trc_reader_nesting = 0;
@@ -1984,119 +1988,6 @@ static inline void rcu_copy_process(struct task_struct *p)
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
-struct pid *pidfd_pid(const struct file *file)
-{
- if (file->f_op == &pidfd_fops)
- return file->private_data;
-
- return ERR_PTR(-EBADF);
-}
-
-static int pidfd_release(struct inode *inode, struct file *file)
-{
- struct pid *pid = file->private_data;
-
- file->private_data = NULL;
- put_pid(pid);
- return 0;
-}
-
-#ifdef CONFIG_PROC_FS
-/**
- * pidfd_show_fdinfo - print information about a pidfd
- * @m: proc fdinfo file
- * @f: file referencing a pidfd
- *
- * Pid:
- * This function will print the pid that a given pidfd refers to in the
- * pid namespace of the procfs instance.
- * If the pid namespace of the process is not a descendant of the pid
- * namespace of the procfs instance 0 will be shown as its pid. This is
- * similar to calling getppid() on a process whose parent is outside of
- * its pid namespace.
- *
- * NSpid:
- * If pid namespaces are supported then this function will also print
- * the pid of a given pidfd refers to for all descendant pid namespaces
- * starting from the current pid namespace of the instance, i.e. the
- * Pid field and the first entry in the NSpid field will be identical.
- * If the pid namespace of the process is not a descendant of the pid
- * namespace of the procfs instance 0 will be shown as its first NSpid
- * entry and no others will be shown.
- * Note that this differs from the Pid and NSpid fields in
- * /proc/<pid>/status where Pid and NSpid are always shown relative to
- * the pid namespace of the procfs instance. The difference becomes
- * obvious when sending around a pidfd between pid namespaces from a
- * different branch of the tree, i.e. where no ancestral relation is
- * present between the pid namespaces:
- * - create two new pid namespaces ns1 and ns2 in the initial pid
- * namespace (also take care to create new mount namespaces in the
- * new pid namespace and mount procfs)
- * - create a process with a pidfd in ns1
- * - send pidfd from ns1 to ns2
- * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
- * have exactly one entry, which is 0
- */
-static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
-{
- struct pid *pid = f->private_data;
- struct pid_namespace *ns;
- pid_t nr = -1;
-
- if (likely(pid_has_task(pid, PIDTYPE_PID))) {
- ns = proc_pid_ns(file_inode(m->file)->i_sb);
- nr = pid_nr_ns(pid, ns);
- }
-
- seq_put_decimal_ll(m, "Pid:\t", nr);
-
-#ifdef CONFIG_PID_NS
- seq_put_decimal_ll(m, "\nNSpid:\t", nr);
- if (nr > 0) {
- int i;
-
- /* If nr is non-zero it means that 'pid' is valid and that
- * ns, i.e. the pid namespace associated with the procfs
- * instance, is in the pid namespace hierarchy of pid.
- * Start at one below the already printed level.
- */
- for (i = ns->level + 1; i <= pid->level; i++)
- seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
- }
-#endif
- seq_putc(m, '\n');
-}
-#endif
-
-/*
- * Poll support for process exit notification.
- */
-static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
-{
- struct pid *pid = file->private_data;
- __poll_t poll_flags = 0;
-
- poll_wait(file, &pid->wait_pidfd, pts);
-
- /*
- * Inform pollers only when the whole thread group exits.
- * If the thread group leader exits before all other threads in the
- * group, then poll(2) should block, similar to the wait(2) family.
- */
- if (thread_group_exited(pid))
- poll_flags = EPOLLIN | EPOLLRDNORM;
-
- return poll_flags;
-}
-
-const struct file_operations pidfd_fops = {
- .release = pidfd_release,
- .poll = pidfd_poll,
-#ifdef CONFIG_PROC_FS
- .show_fdinfo = pidfd_show_fdinfo,
-#endif
-};
-
/**
* __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
* @pid: the struct pid for which to create a pidfd
@@ -2130,20 +2021,20 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
int pidfd;
struct file *pidfd_file;
- if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
- return -EINVAL;
-
- pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ pidfd = get_unused_fd_flags(O_CLOEXEC);
if (pidfd < 0)
return pidfd;
- pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
- flags | O_RDWR | O_CLOEXEC);
+ pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
if (IS_ERR(pidfd_file)) {
put_unused_fd(pidfd);
return PTR_ERR(pidfd_file);
}
- get_pid(pid); /* held by pidfd_file now */
+ /*
+ * Was true of anon_inode_getfile() (keeps only O_ACCMODE | O_NONBLOCK);
+ * presumably pidfs_alloc_file() also drops PIDFD_THREAD, so set it manually.
+ */
+ pidfd_file->f_flags |= (flags & PIDFD_THREAD);
*ret = pidfd_file;
return pidfd;
}
@@ -2157,7 +2048,8 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
* caller's file descriptor table. The pidfd is reserved but not installed yet.
*
- * The helper verifies that @pid is used as a thread group leader.
+ * The helper verifies that @pid is still in use; without PIDFD_THREAD the
+ * task identified by @pid must be a thread-group leader.
*
* If this function returns successfully the caller is responsible to either
* call fd_install() passing the returned pidfd and pidfd file as arguments in
@@ -2176,7 +2068,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
*/
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
- if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+ bool thread = flags & PIDFD_THREAD;
+
+ if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
return -EINVAL;
return __pidfd_prepare(pid, flags, ret);
@@ -2298,9 +2192,8 @@ __latent_entropy struct task_struct *copy_process(
/*
* - CLONE_DETACHED is blocked so that we can potentially
* reuse it later for CLONE_PIDFD.
- * - CLONE_THREAD is blocked until someone really needs it.
*/
- if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
+ if (clone_flags & CLONE_DETACHED)
return ERR_PTR(-EINVAL);
}
@@ -2523,8 +2416,10 @@ __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
+ int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
+
/* Note that no task has been attached to @pid yet. */
- retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
+ retval = __pidfd_prepare(pid, flags, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
@@ -2875,8 +2770,8 @@ pid_t kernel_clone(struct kernel_clone_args *args)
* here has the advantage that we don't need to have a separate helper
* to check for legacy clone().
*/
- if ((args->flags & CLONE_PIDFD) &&
- (args->flags & CLONE_PARENT_SETTID) &&
+ if ((clone_flags & CLONE_PIDFD) &&
+ (clone_flags & CLONE_PARENT_SETTID) &&
(args->pidfd == args->parent_tid))
return -EINVAL;
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index e0e853412c15..1e78ef24321e 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -627,12 +627,21 @@ retry:
}
/*
- * PI futexes can not be requeued and must remove themselves from the
- * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
+ * PI futexes can not be requeued and must remove themselves from the hash
+ * bucket. The hash bucket lock (i.e. lock_ptr) is held.
*/
void futex_unqueue_pi(struct futex_q *q)
{
- __futex_unqueue(q);
+ /*
+ * If the lock was not acquired (due to timeout or signal) then the
+ * rt_waiter is removed before futex_q is. If this is observed by
+ * an unlocker after dropping the rtmutex wait lock and before
+ * acquiring the hash bucket lock, then the unlocker dequeues the
+ * futex_q from the hash bucket list to guarantee consistent state
+ * vs. userspace. Therefore the dequeue here must be conditional.
+ */
+ if (!plist_node_empty(&q->list))
+ __futex_unqueue(q);
BUG_ON(!q->pi_state);
put_pi_state(q->pi_state);
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index 90e5197f4e56..5722467f2737 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -1135,6 +1135,7 @@ retry:
hb = futex_hash(&key);
spin_lock(&hb->lock);
+retry_hb:
/*
* Check waiters first. We do not trust user space values at
@@ -1177,12 +1178,17 @@ retry:
/*
* Futex vs rt_mutex waiter state -- if there are no rt_mutex
* waiters even though futex thinks there are, then the waiter
- * is leaving and the uncontended path is safe to take.
+ * is leaving. The entry needs to be removed from the list so a
+ * new futex_lock_pi() is not using this stale PI-state while
+ * the futex is available in user space again.
+ * There can be more than one task on its way out so it needs
+ * to retry.
*/
rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
if (!rt_waiter) {
+ __futex_unqueue(top_waiter);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- goto do_uncontended;
+ goto retry_hb;
}
get_pi_state(pi_state);
@@ -1217,7 +1223,6 @@ retry:
return ret;
}
-do_uncontended:
/*
* We have no kernel internal state, i.e. no waiters in the
* kernel. Waiters which are about to queue themselves are stuck
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 27ca1c866f29..371eb1711d34 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -600,7 +600,7 @@ int __init early_irq_init(void)
mutex_init(&desc[i].request_mutex);
init_waitqueue_head(&desc[i].wait_for_threads);
desc_set_defaults(i, &desc[i], node, NULL, NULL);
- irq_resend_init(desc);
+ irq_resend_init(&desc[i]);
}
return arch_early_irq_init();
}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index a08031b57a61..d08fc7b5db97 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1257,6 +1257,7 @@ int kernel_kexec(void)
kexec_in_progress = true;
kernel_restart_prepare("kexec reboot");
migrate_to_reboot_cpu();
+ syscore_shutdown();
/*
* migrate_to_reboot_cpu() disables CPU hotplug assuming that
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d5a0ee40bf66..9d9095e81792 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1993,7 +1993,7 @@ NOKPROBE_SYMBOL(__kretprobe_find_ret_addr);
unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
struct llist_node **cur)
{
- struct kretprobe_instance *ri = NULL;
+ struct kretprobe_instance *ri;
kprobe_opcode_t *ret;
if (WARN_ON_ONCE(!cur))
@@ -2802,7 +2802,7 @@ static int show_kprobe_addr(struct seq_file *pi, void *v)
{
struct hlist_head *head;
struct kprobe *p, *kp;
- const char *sym = NULL;
+ const char *sym;
unsigned int i = *(loff_t *) v;
unsigned long offset = 0;
char *modname, namebuf[KSYM_NAME_LEN];
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 15781acaac1c..6ec3deec68c2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -573,7 +573,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
if (proc_ns_file(f.file))
err = validate_ns(&nsset, ns);
else
- err = validate_nsset(&nsset, f.file->private_data);
+ err = validate_nsset(&nsset, pidfd_pid(f.file));
if (!err) {
commit_nsset(&nsset);
perf_event_namespaces(current);
diff --git a/kernel/pid.c b/kernel/pid.c
index b52b10865454..99a0c5eb24b8 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -42,6 +42,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
+#include <linux/pidfs.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>
@@ -65,6 +66,13 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
+#ifdef CONFIG_FS_PID
+/*
+ * Pseudo filesystems start inode numbering after one. We use Reserved
+ * PIDs as a natural offset.
+ */
+static u64 pidfs_ino = RESERVED_PIDS;
+#endif
/*
* PID-map pages start out as NULL, they get allocated upon
@@ -272,6 +280,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
+#ifdef CONFIG_FS_PID
+ pid->stashed = NULL;
+ pid->ino = ++pidfs_ino;
+#endif
for ( ; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);
@@ -349,6 +361,11 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
hlist_del_rcu(&task->pid_links[type]);
*pid_ptr = new;
+ if (type == PIDTYPE_PID) {
+ WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
+ wake_up_all(&pid->wait_pidfd);
+ }
+
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
if (pid_has_task(pid, tmp))
return;
@@ -391,8 +408,7 @@ void exchange_tids(struct task_struct *left, struct task_struct *right)
void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type type)
{
- if (type == PIDTYPE_PID)
- new->thread_pid = old->thread_pid;
+ WARN_ON_ONCE(type == PIDTYPE_PID);
hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}
@@ -552,11 +568,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
* Return the task associated with @pidfd. The function takes a reference on
* the returned task. The caller is responsible for releasing that reference.
*
- * Currently, the process identified by @pidfd is always a thread-group leader.
- * This restriction currently exists for all aspects of pidfds including pidfd
- * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
- * (only supports thread group leaders).
- *
* Return: On success, the task_struct associated with the pidfd.
* On error, a negative errno number will be returned.
*/
@@ -595,7 +606,7 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
*/
-int pidfd_create(struct pid *pid, unsigned int flags)
+static int pidfd_create(struct pid *pid, unsigned int flags)
{
int pidfd;
struct file *pidfd_file;
@@ -615,11 +626,8 @@ int pidfd_create(struct pid *pid, unsigned int flags)
* @flags: flags to pass
*
* This creates a new pid file descriptor with the O_CLOEXEC flag set for
- * the process identified by @pid. Currently, the process identified by
- * @pid must be a thread-group leader. This restriction currently exists
- * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
- * be used with CLONE_THREAD) and pidfd polling (only supports thread group
- * leaders).
+ * the task identified by @pid. Without PIDFD_THREAD flag the target task
+ * must be a thread-group leader.
*
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
@@ -629,7 +637,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
int fd;
struct pid *p;
- if (flags & ~PIDFD_NONBLOCK)
+ if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
return -EINVAL;
if (pid <= 0)
@@ -682,7 +690,26 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd)
up_read(&task->signal->exec_update_lock);
- return file ?: ERR_PTR(-EBADF);
+ if (!file) {
+ /*
+ * It is possible that the target thread is exiting; it can be
+ * either:
+ * 1. before exit_signals(), which gives a real fd
+ * 2. before exit_files() takes the task_lock(), which gives a real fd
+ * 3. after exit_files() releases task_lock(), when ->files is NULL;
+ * the task has PF_EXITING, since it was set in exit_signals(),
+ * and __pidfd_fget() returns EBADF.
+ * In case 3 we get EBADF, but that really means ESRCH, since
+ * the task is currently exiting and has freed its files
+ * struct, so we fix it up.
+ */
+ if (task->flags & PF_EXITING)
+ file = ERR_PTR(-ESRCH);
+ else
+ file = ERR_PTR(-EBADF);
+ }
+
+ return file;
}
static int pidfd_getfd(struct pid *pid, int fd)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 6053ddddaf65..692f12fe60c1 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -222,7 +222,7 @@ int swsusp_swap_in_use(void)
*/
static unsigned short root_swap = 0xffff;
-static struct bdev_handle *hib_resume_bdev_handle;
+static struct file *hib_resume_bdev_file;
struct hib_bio_batch {
atomic_t count;
@@ -276,7 +276,7 @@ static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr,
struct bio *bio;
int error = 0;
- bio = bio_alloc(hib_resume_bdev_handle->bdev, 1, opf,
+ bio = bio_alloc(file_bdev(hib_resume_bdev_file), 1, opf,
GFP_NOIO | __GFP_HIGH);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
@@ -357,14 +357,14 @@ static int swsusp_swap_check(void)
return res;
root_swap = res;
- hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device,
+ hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device,
BLK_OPEN_WRITE, NULL, NULL);
- if (IS_ERR(hib_resume_bdev_handle))
- return PTR_ERR(hib_resume_bdev_handle);
+ if (IS_ERR(hib_resume_bdev_file))
+ return PTR_ERR(hib_resume_bdev_file);
- res = set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE);
+ res = set_blocksize(file_bdev(hib_resume_bdev_file), PAGE_SIZE);
if (res < 0)
- bdev_release(hib_resume_bdev_handle);
+ fput(hib_resume_bdev_file);
return res;
}
@@ -1523,10 +1523,10 @@ int swsusp_check(bool exclusive)
void *holder = exclusive ? &swsusp_holder : NULL;
int error;
- hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device,
+ hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device,
BLK_OPEN_READ, holder, NULL);
- if (!IS_ERR(hib_resume_bdev_handle)) {
- set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE);
+ if (!IS_ERR(hib_resume_bdev_file)) {
+ set_blocksize(file_bdev(hib_resume_bdev_file), PAGE_SIZE);
clear_page(swsusp_header);
error = hib_submit_io(REQ_OP_READ, swsusp_resume_block,
swsusp_header, NULL);
@@ -1551,11 +1551,11 @@ int swsusp_check(bool exclusive)
put:
if (error)
- bdev_release(hib_resume_bdev_handle);
+ fput(hib_resume_bdev_file);
else
pr_debug("Image signature found, resuming\n");
} else {
- error = PTR_ERR(hib_resume_bdev_handle);
+ error = PTR_ERR(hib_resume_bdev_file);
}
if (error)
@@ -1570,12 +1570,12 @@ put:
void swsusp_close(void)
{
- if (IS_ERR(hib_resume_bdev_handle)) {
+ if (IS_ERR(hib_resume_bdev_file)) {
pr_debug("Image device not initialised\n");
return;
}
- bdev_release(hib_resume_bdev_handle);
+ fput(hib_resume_bdev_file);
}
/**
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index bdd7eadb33d8..e7d2dd267593 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -314,6 +314,19 @@ config RCU_LAZY
To save power, batch RCU callbacks and flush after delay, memory
pressure, or callback list growing too big.
+ Requires rcu_nocbs=all to be set.
+
+ Use rcutree.enable_rcu_lazy=0 to turn it off at boot time.
+
+config RCU_LAZY_DEFAULT_OFF
+ bool "Turn RCU lazy invocation off by default"
+ depends on RCU_LAZY
+ default n
+ help
+ Allows building the kernel with CONFIG_RCU_LAZY=y while keeping it off
+ by default. The boot-time parameter rcutree.enable_rcu_lazy=1 can be
+ used to switch it back on.
+
config RCU_DOUBLE_CHECK_CB_TIME
bool "RCU callback-batch backup time check"
depends on RCU_EXPERT
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index f94f65877f2b..86fce206560e 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -528,6 +528,12 @@ struct task_struct *get_rcu_tasks_gp_kthread(void);
struct task_struct *get_rcu_tasks_rude_gp_kthread(void);
#endif // # ifdef CONFIG_TASKS_RUDE_RCU
+#ifdef CONFIG_TASKS_RCU_GENERIC
+void tasks_cblist_init_generic(void);
+#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
+static inline void tasks_cblist_init_generic(void) { }
+#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
+
#define RCU_SCHEDULER_INACTIVE 0
#define RCU_SCHEDULER_INIT 1
#define RCU_SCHEDULER_RUNNING 2
@@ -543,11 +549,11 @@ enum rcutorture_type {
};
#if defined(CONFIG_RCU_LAZY)
-unsigned long rcu_lazy_get_jiffies_till_flush(void);
-void rcu_lazy_set_jiffies_till_flush(unsigned long j);
+unsigned long rcu_get_jiffies_lazy_flush(void);
+void rcu_set_jiffies_lazy_flush(unsigned long j);
#else
-static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
-static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
+static inline unsigned long rcu_get_jiffies_lazy_flush(void) { return 0; }
+static inline void rcu_set_jiffies_lazy_flush(unsigned long j) { }
#endif
#if defined(CONFIG_TREE_RCU)
@@ -623,12 +629,7 @@ int rcu_get_gp_kthreads_prio(void);
void rcu_fwd_progress_check(unsigned long j);
void rcu_force_quiescent_state(void);
extern struct workqueue_struct *rcu_gp_wq;
-#ifdef CONFIG_RCU_EXP_KTHREAD
extern struct kthread_worker *rcu_exp_gp_kworker;
-extern struct kthread_worker *rcu_exp_par_gp_kworker;
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-extern struct workqueue_struct *rcu_par_gp_wq;
-#endif /* CONFIG_RCU_EXP_KTHREAD */
void rcu_gp_slow_register(atomic_t *rgssp);
void rcu_gp_slow_unregister(atomic_t *rgssp);
#endif /* #else #ifdef CONFIG_TINY_RCU */
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index ffdb30495e3c..8db4fedaaa1e 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -764,9 +764,9 @@ kfree_scale_init(void)
if (kfree_by_call_rcu) {
/* do a test to check the timeout. */
- orig_jif = rcu_lazy_get_jiffies_till_flush();
+ orig_jif = rcu_get_jiffies_lazy_flush();
- rcu_lazy_set_jiffies_till_flush(2 * HZ);
+ rcu_set_jiffies_lazy_flush(2 * HZ);
rcu_barrier();
jif_start = jiffies;
@@ -775,7 +775,7 @@ kfree_scale_init(void)
smp_cond_load_relaxed(&rcu_lazy_test1_cb_called, VAL == 1);
- rcu_lazy_set_jiffies_till_flush(orig_jif);
+ rcu_set_jiffies_lazy_flush(orig_jif);
if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) {
pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n");
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 7567ca8e743c..45d6b4c3d199 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1368,9 +1368,13 @@ rcu_torture_writer(void *arg)
struct rcu_torture *rp;
struct rcu_torture *old_rp;
static DEFINE_TORTURE_RANDOM(rand);
+ unsigned long stallsdone = jiffies;
bool stutter_waited;
unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+ // If a new stall test is added, this must be adjusted.
+ if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu)
+ stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * HZ;
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
if (!can_expedite)
pr_alert("%s" TORTURE_FLAG
@@ -1576,11 +1580,11 @@ rcu_torture_writer(void *arg)
!atomic_read(&rcu_fwd_cb_nodelay) &&
!cur_ops->slow_gps &&
!torture_must_stop() &&
- boot_ended)
+ boot_ended &&
+ time_after(jiffies, stallsdone))
for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
if (list_empty(&rcu_tortures[i].rtort_free) &&
- rcu_access_pointer(rcu_torture_current) !=
- &rcu_tortures[i]) {
+ rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) {
tracing_off();
show_rcu_gp_kthreads();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
@@ -2441,7 +2445,8 @@ static struct notifier_block rcu_torture_stall_block = {
/*
* CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
- * induces a CPU stall for the time specified by stall_cpu.
+ * induces a CPU stall for the time specified by stall_cpu. If a new
+ * stall test is added, stallsdone in rcu_torture_writer() must be adjusted.
*/
static int rcu_torture_stall(void *args)
{
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 0351a4e83529..e4d673fc30f4 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1234,11 +1234,20 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
if (rhp)
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
/*
- * The snapshot for acceleration must be taken _before_ the read of the
- * current gp sequence used for advancing, otherwise advancing may fail
- * and acceleration may then fail too.
+ * It's crucial to capture the snapshot 's' for acceleration before
+ * reading the current gp_seq that is used for advancing. This is
+ * essential because if the acceleration snapshot is taken after a
+ * failed advancement attempt, there's a risk that a grace period may
+ * conclude and a new one may start in the interim. If the snapshot is
+ * captured after this sequence of events, the acceleration snapshot 's'
+ * could be excessively advanced, leading to acceleration failure.
+ * In such a scenario, an 'acceleration leak' can occur, where new
+ * callbacks become indefinitely stuck in the RCU_NEXT_TAIL segment.
+ * Also note that encountering advancing failures is a normal
+ * occurrence when the grace period for RCU_WAIT_TAIL is in progress.
*
- * This could happen if:
+ * To see this, consider the following events which occur if
+ * rcu_seq_snap() were to be called after advance:
*
* 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the
* RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8).
@@ -1264,6 +1273,13 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
if (rhp) {
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ /*
+ * Acceleration can never fail because the base current gp_seq
+ * used for acceleration is <= the value of gp_seq used for
+ * advancing. This means that RCU_NEXT_TAIL segment will
+ * always be able to be emptied by the acceleration into the
+ * RCU_NEXT_READY_TAIL or RCU_WAIT_TAIL segments.
+ */
WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s));
}
if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index e550f97779b8..86df878a2fee 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -24,22 +24,6 @@ void rcu_sync_init(struct rcu_sync *rsp)
init_waitqueue_head(&rsp->gp_wait);
}
-/**
- * rcu_sync_enter_start - Force readers onto slow path for multiple updates
- * @rsp: Pointer to rcu_sync structure to use for synchronization
- *
- * Must be called after rcu_sync_init() and before first use.
- *
- * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
- * pairs turn into NO-OPs.
- */
-void rcu_sync_enter_start(struct rcu_sync *rsp)
-{
- rsp->gp_count++;
- rsp->gp_state = GP_PASSED;
-}
-
-
static void rcu_sync_func(struct rcu_head *rhp);
static void rcu_sync_call(struct rcu_sync *rsp)
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 732ad5b39946..147b5945d67a 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -32,6 +32,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* @rtp_irq_work: IRQ work queue for deferred wakeups.
* @barrier_q_head: RCU callback for barrier operation.
* @rtp_blkd_tasks: List of tasks blocked as readers.
+ * @rtp_exit_list: List of tasks in the latter portion of do_exit().
* @cpu: CPU number corresponding to this entry.
* @rtpp: Pointer to the rcu_tasks structure.
*/
@@ -46,6 +47,7 @@ struct rcu_tasks_percpu {
struct irq_work rtp_irq_work;
struct rcu_head barrier_q_head;
struct list_head rtp_blkd_tasks;
+ struct list_head rtp_exit_list;
int cpu;
struct rcu_tasks *rtpp;
};
@@ -144,8 +146,6 @@ static struct rcu_tasks rt_name = \
}
#ifdef CONFIG_TASKS_RCU
-/* Track exiting tasks in order to allow them to be waited for. */
-DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
@@ -240,7 +240,6 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
static void cblist_init_generic(struct rcu_tasks *rtp)
{
int cpu;
- unsigned long flags;
int lim;
int shift;
@@ -266,15 +265,15 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
WARN_ON_ONCE(!rtpcp);
if (cpu)
raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock));
- local_irq_save(flags); // serialize initialization
if (rcu_segcblist_empty(&rtpcp->cblist))
rcu_segcblist_init(&rtpcp->cblist);
- local_irq_restore(flags);
INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
rtpcp->cpu = cpu;
rtpcp->rtpp = rtp;
if (!rtpcp->rtp_blkd_tasks.next)
INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
+ if (!rtpcp->rtp_exit_list.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
}
pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name,
@@ -851,10 +850,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// number of voluntary context switches, and add that task to the
// holdout list.
// rcu_tasks_postscan():
-// Invoke synchronize_srcu() to ensure that all tasks that were
-// in the process of exiting (and which thus might not know to
-// synchronize with this RCU Tasks grace period) have completed
-// exiting.
+// Gather per-CPU lists of tasks in do_exit() to ensure that all
+// tasks that were in the process of exiting (and which thus might
+// not know to synchronize with this RCU Tasks grace period) have
+// completed exiting. The synchronize_rcu() in rcu_tasks_postgp()
+// will take care of any tasks stuck in the non-preemptible region
+// of do_exit() following its call to exit_tasks_rcu_stop().
// check_all_holdout_tasks(), repeatedly until holdout list is empty:
// Scans the holdout list, attempting to identify a quiescent state
// for each task on the list. If there is a quiescent state, the
@@ -867,8 +868,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// with interrupts disabled.
//
// For each exiting task, the exit_tasks_rcu_start() and
-// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU
-// read-side critical sections waited for by rcu_tasks_postscan().
+// exit_tasks_rcu_finish() functions add and remove, respectively, the
+// current task to a per-CPU list of tasks that rcu_tasks_postscan() must
+// wait on. This is necessary because rcu_tasks_postscan() must wait on
+// tasks that have already been removed from the global list of tasks.
//
// Pre-grace-period update-side code is ordered before the grace
// via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code
@@ -932,9 +935,13 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
}
}
+void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
+DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
+
/* Processing between scanning taskslist and draining the holdout list. */
static void rcu_tasks_postscan(struct list_head *hop)
{
+ int cpu;
int rtsi = READ_ONCE(rcu_task_stall_info);
if (!IS_ENABLED(CONFIG_TINY_RCU)) {
@@ -948,9 +955,9 @@ static void rcu_tasks_postscan(struct list_head *hop)
* this, divide the fragile exit path part in two intersecting
* read side critical sections:
*
- * 1) An _SRCU_ read side starting before calling exit_notify(),
- * which may remove the task from the tasklist, and ending after
- * the final preempt_disable() call in do_exit().
+ * 1) A task_struct list addition before calling exit_notify(),
+ * which may remove the task from the tasklist, with the
+ * removal after the final preempt_disable() call in do_exit().
*
* 2) An _RCU_ read side starting with the final preempt_disable()
* call in do_exit() and ending with the final call to schedule()
@@ -959,7 +966,37 @@ static void rcu_tasks_postscan(struct list_head *hop)
* This handles the part 1). And postgp will handle part 2) with a
* call to synchronize_rcu().
*/
- synchronize_srcu(&tasks_rcu_exit_srcu);
+
+ for_each_possible_cpu(cpu) {
+ unsigned long j = jiffies + 1;
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, cpu);
+ struct task_struct *t;
+ struct task_struct *t1;
+ struct list_head tmp;
+
+ raw_spin_lock_irq_rcu_node(rtpcp);
+ list_for_each_entry_safe(t, t1, &rtpcp->rtp_exit_list, rcu_tasks_exit_list) {
+ if (list_empty(&t->rcu_tasks_holdout_list))
+ rcu_tasks_pertask(t, hop);
+
+ // RT kernels need frequent pauses; other kernels
+ // pause at least once per pair of jiffies.
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && time_before(jiffies, j))
+ continue;
+
+ // Keep our place in the list while pausing.
+ // Nothing else traverses this list, so adding a
+ // bare list_head is OK.
+ list_add(&tmp, &t->rcu_tasks_exit_list);
+ raw_spin_unlock_irq_rcu_node(rtpcp);
+ cond_resched(); // For CONFIG_PREEMPT=n kernels
+ raw_spin_lock_irq_rcu_node(rtpcp);
+ t1 = list_entry(tmp.next, struct task_struct, rcu_tasks_exit_list);
+ list_del(&tmp);
+ j = jiffies + 1;
+ }
+ raw_spin_unlock_irq_rcu_node(rtpcp);
+ }
if (!IS_ENABLED(CONFIG_TINY_RCU))
del_timer_sync(&tasks_rcu_exit_srcu_stall_timer);
@@ -1027,7 +1064,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
*
* In addition, this synchronize_rcu() waits for exiting tasks
* to complete their final preempt_disable() region of execution,
- * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu),
* enforcing the whole region before tasklist removal until
* the final schedule() with TASK_DEAD state to be an RCU TASKS
* read side critical section.
@@ -1035,9 +1071,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
synchronize_rcu();
}
-void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
-DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
-
static void tasks_rcu_exit_srcu_stall(struct timer_list *unused)
{
#ifndef CONFIG_TINY_RCU
@@ -1118,7 +1151,6 @@ module_param(rcu_tasks_lazy_ms, int, 0444);
static int __init rcu_spawn_tasks_kthread(void)
{
- cblist_init_generic(&rcu_tasks);
rcu_tasks.gp_sleep = HZ / 10;
rcu_tasks.init_fract = HZ / 10;
if (rcu_tasks_lazy_ms >= 0)
@@ -1147,25 +1179,48 @@ struct task_struct *get_rcu_tasks_gp_kthread(void)
EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread);
/*
- * Contribute to protect against tasklist scan blind spot while the
- * task is exiting and may be removed from the tasklist. See
- * corresponding synchronize_srcu() for further details.
+ * Protect against tasklist scan blind spot while the task is exiting and
+ * may be removed from the tasklist. Do this by adding the task to yet
+ * another list.
+ *
+ * Note that the task will remove itself from this list, so there is no
+ * need for get_task_struct(), except in the case where rcu_tasks_pertask()
+ * adds it to the holdout list, in which case rcu_tasks_pertask() supplies
+ * the needed get_task_struct().
*/
-void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
+void exit_tasks_rcu_start(void)
{
- current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ struct task_struct *t = current;
+
+ WARN_ON_ONCE(!list_empty(&t->rcu_tasks_exit_list));
+ preempt_disable();
+ rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu);
+ t->rcu_tasks_exit_cpu = smp_processor_id();
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ if (!rtpcp->rtp_exit_list.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
+ list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ preempt_enable();
}
/*
- * Contribute to protect against tasklist scan blind spot while the
- * task is exiting and may be removed from the tasklist. See
- * corresponding synchronize_srcu() for further details.
+ * Remove the task from the "yet another list" because do_exit() is now
+ * non-preemptible, allowing synchronize_rcu() to wait beyond this point.
*/
-void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu)
+void exit_tasks_rcu_stop(void)
{
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
struct task_struct *t = current;
- __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx);
+ WARN_ON_ONCE(list_empty(&t->rcu_tasks_exit_list));
+ rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, t->rcu_tasks_exit_cpu);
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ list_del_init(&t->rcu_tasks_exit_list);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
}
/*
@@ -1282,7 +1337,6 @@ module_param(rcu_tasks_rude_lazy_ms, int, 0444);
static int __init rcu_spawn_tasks_rude_kthread(void)
{
- cblist_init_generic(&rcu_tasks_rude);
rcu_tasks_rude.gp_sleep = HZ / 10;
if (rcu_tasks_rude_lazy_ms >= 0)
rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms);
@@ -1914,7 +1968,6 @@ module_param(rcu_tasks_trace_lazy_ms, int, 0444);
static int __init rcu_spawn_tasks_trace_kthread(void)
{
- cblist_init_generic(&rcu_tasks_trace);
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) {
rcu_tasks_trace.gp_sleep = HZ / 10;
rcu_tasks_trace.init_fract = HZ / 10;
@@ -2086,6 +2139,24 @@ late_initcall(rcu_tasks_verify_schedule_work);
static void rcu_tasks_initiate_self_tests(void) { }
#endif /* #else #ifdef CONFIG_PROVE_RCU */
+void __init tasks_cblist_init_generic(void)
+{
+ lockdep_assert_irqs_disabled();
+ WARN_ON(num_online_cpus() > 1);
+
+#ifdef CONFIG_TASKS_RCU
+ cblist_init_generic(&rcu_tasks);
+#endif
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+ cblist_init_generic(&rcu_tasks_rude);
+#endif
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ cblist_init_generic(&rcu_tasks_trace);
+#endif
+}
+
void __init rcu_init_tasks_generic(void)
{
#ifdef CONFIG_TASKS_RCU
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index fec804b79080..705c0d16850a 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -261,4 +261,5 @@ void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
rcu_early_boot_tests();
+ tasks_cblist_init_generic();
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1ae851777806..d9642dd06c25 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -145,7 +145,7 @@ static int rcu_scheduler_fully_active __read_mostly;
static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
unsigned long gps, unsigned long flags);
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
+static struct task_struct *rcu_boost_task(struct rcu_node *rnp);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
@@ -1013,6 +1013,38 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
return needmore;
}
+static void swake_up_one_online_ipi(void *arg)
+{
+ struct swait_queue_head *wqh = arg;
+
+ swake_up_one(wqh);
+}
+
+static void swake_up_one_online(struct swait_queue_head *wqh)
+{
+ int cpu = get_cpu();
+
+ /*
+ * If called from rcutree_report_cpu_dead(), a wake up is
+ * dangerous this late in the CPU-down hotplug process: the
+ * scheduler might queue an hrtimer that is then ignored. Defer
+ * the wake up to an online CPU instead.
+ */
+ if (unlikely(cpu_is_offline(cpu))) {
+ int target;
+
+ target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU),
+ cpu_online_mask);
+
+ smp_call_function_single(target, swake_up_one_online_ipi,
+ wqh, 0);
+ put_cpu();
+ } else {
+ put_cpu();
+ swake_up_one(wqh);
+ }
+}
+
/*
* Awaken the grace-period kthread. Don't do a self-awaken (unless in an
* interrupt or softirq handler, in which case we just might immediately
@@ -1037,7 +1069,7 @@ static void rcu_gp_kthread_wake(void)
return;
WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
- swake_up_one(&rcu_state.gp_wq);
+ swake_up_one_online(&rcu_state.gp_wq);
}
/*
@@ -2113,6 +2145,12 @@ static void rcu_do_batch(struct rcu_data *rdp)
* Extract the list of ready callbacks, disabling IRQs to prevent
* races with call_rcu() from interrupt handlers. Leave the
* callback counts, as rcu_barrier() needs to be conservative.
+ *
+ * Callback execution is fully ordered against the preceding grace period
+ * completion (materialized by the rnp->gp_seq update) thanks to the
+ * smp_mb__after_unlock_lock() upon node locking required for callback
+ * advancing. In NOCB mode this ordering is then further relayed through
+ * the nocb locking that protects both callback advancing and extraction.
*/
rcu_nocb_lock_irqsave(rdp, flags);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
@@ -2559,12 +2597,26 @@ static int __init rcu_spawn_core_kthreads(void)
return 0;
}
+static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func)
+{
+ rcu_segcblist_enqueue(&rdp->cblist, head);
+ if (__is_kvfree_rcu_offset((unsigned long)func))
+ trace_rcu_kvfree_callback(rcu_state.name, head,
+ (unsigned long)func,
+ rcu_segcblist_n_cbs(&rdp->cblist));
+ else
+ trace_rcu_callback(rcu_state.name, head,
+ rcu_segcblist_n_cbs(&rdp->cblist));
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
+}
+
/*
* Handle any core-RCU processing required by a call_rcu() invocation.
*/
-static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
- unsigned long flags)
+static void call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
+ rcu_callback_t func, unsigned long flags)
{
+ rcutree_enqueue(rdp, head, func);
/*
* If called from an extended quiescent state, invoke the RCU
* core in order to force a re-evaluation of RCU's idleness.
@@ -2660,7 +2712,6 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
unsigned long flags;
bool lazy;
struct rcu_data *rdp;
- bool was_alldone;
/* Misaligned rcu_head! */
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
@@ -2697,30 +2748,18 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
}
check_cb_ovld(rdp);
- if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
- return; // Enqueued onto ->nocb_bypass, so just leave.
- // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
- rcu_segcblist_enqueue(&rdp->cblist, head);
- if (__is_kvfree_rcu_offset((unsigned long)func))
- trace_rcu_kvfree_callback(rcu_state.name, head,
- (unsigned long)func,
- rcu_segcblist_n_cbs(&rdp->cblist));
- else
- trace_rcu_callback(rcu_state.name, head,
- rcu_segcblist_n_cbs(&rdp->cblist));
- trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
-
- /* Go handle any RCU core processing required. */
- if (unlikely(rcu_rdp_is_offloaded(rdp))) {
- __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
- } else {
- __call_rcu_core(rdp, head, flags);
- local_irq_restore(flags);
- }
+ if (unlikely(rcu_rdp_is_offloaded(rdp)))
+ call_rcu_nocb(rdp, head, func, flags, lazy);
+ else
+ call_rcu_core(rdp, head, func, flags);
+ local_irq_restore(flags);
}
#ifdef CONFIG_RCU_LAZY
+static bool enable_rcu_lazy __read_mostly = !IS_ENABLED(CONFIG_RCU_LAZY_DEFAULT_OFF);
+module_param(enable_rcu_lazy, bool, 0444);
+
/**
* call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
* flush all lazy callbacks (including the new one) to the main ->cblist while
@@ -2746,6 +2785,8 @@ void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
__call_rcu_common(head, func, false);
}
EXPORT_SYMBOL_GPL(call_rcu_hurry);
+#else
+#define enable_rcu_lazy false
#endif
/**
@@ -2794,7 +2835,7 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry);
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
+ __call_rcu_common(head, func, enable_rcu_lazy);
}
EXPORT_SYMBOL_GPL(call_rcu);
@@ -4362,6 +4403,66 @@ rcu_boot_init_percpu_data(int cpu)
rcu_boot_init_nocb_percpu_data(rdp);
}
+struct kthread_worker *rcu_exp_gp_kworker;
+
+static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
+{
+ struct kthread_worker *kworker;
+ const char *name = "rcu_exp_par_gp_kthread_worker/%d";
+ struct sched_param param = { .sched_priority = kthread_prio };
+ int rnp_index = rnp - rcu_get_root();
+
+ if (rnp->exp_kworker)
+ return;
+
+ kworker = kthread_create_worker(0, name, rnp_index);
+ if (IS_ERR_OR_NULL(kworker)) {
+ pr_err("Failed to create par gp kworker on %d/%d\n",
+ rnp->grplo, rnp->grphi);
+ return;
+ }
+ WRITE_ONCE(rnp->exp_kworker, kworker);
+
+ if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
+ sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
+}
+
+static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
+{
+ struct kthread_worker *kworker = READ_ONCE(rnp->exp_kworker);
+
+ if (!kworker)
+ return NULL;
+
+ return kworker->task;
+}
+
+static void __init rcu_start_exp_gp_kworker(void)
+{
+ const char *name = "rcu_exp_gp_kthread_worker";
+ struct sched_param param = { .sched_priority = kthread_prio };
+
+ rcu_exp_gp_kworker = kthread_create_worker(0, name);
+ if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
+ pr_err("Failed to create %s!\n", name);
+ rcu_exp_gp_kworker = NULL;
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
+ sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
+}
+
+static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp)
+{
+ if (rcu_scheduler_fully_active) {
+ mutex_lock(&rnp->kthread_mutex);
+ rcu_spawn_one_boost_kthread(rnp);
+ rcu_spawn_exp_par_gp_kworker(rnp);
+ mutex_unlock(&rnp->kthread_mutex);
+ }
+}
+
/*
* Invoked early in the CPU-online process, when pretty much all services
* are available. The incoming CPU is not present.
@@ -4410,7 +4511,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- rcu_spawn_one_boost_kthread(rnp);
+ rcu_spawn_rnp_kthreads(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
@@ -4418,13 +4519,64 @@ int rcutree_prepare_cpu(unsigned int cpu)
}
/*
- * Update RCU priority boot kthread affinity for CPU-hotplug changes.
+ * Update kthread affinity during CPU-hotplug changes.
+ *
+ * Set the affinity of the per-rcu_node kthreads (boost and expedited GP
+ * worker) to cover all CPUs that are served by the rcu_node in question.
+ * The CPU hotplug lock is still held, so rnp->qsmaskinit will be stable.
+ *
+ * We don't include outgoingcpu in the affinity set; use -1 if there is
+ * no outgoing CPU. If no CPUs are left in the affinity set, the kthreads
+ * may execute on any housekeeping CPU other than outgoingcpu.
+ *
+ * Any future concurrent calls are serialized via ->kthread_mutex.
*/
-static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
+static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu)
{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ cpumask_var_t cm;
+ unsigned long mask;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct task_struct *task_boost, *task_exp;
+
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ rnp = rdp->mynode;
+
+ task_boost = rcu_boost_task(rnp);
+ task_exp = rcu_exp_par_gp_task(rnp);
- rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
+ /*
+ * If this is the boot CPU, these tasks are created later from an
+ * early initcall, since kthreadd must be created first.
+ */
+ if (!task_boost && !task_exp)
+ return;
+
+ if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
+ return;
+
+ mutex_lock(&rnp->kthread_mutex);
+ mask = rcu_rnp_online_cpus(rnp);
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
+ cpu != outgoingcpu)
+ cpumask_set_cpu(cpu, cm);
+ cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
+ if (cpumask_empty(cm)) {
+ cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
+ if (outgoingcpu >= 0)
+ cpumask_clear_cpu(outgoingcpu, cm);
+ }
+
+ if (task_exp)
+ set_cpus_allowed_ptr(task_exp, cm);
+
+ if (task_boost)
+ set_cpus_allowed_ptr(task_boost, cm);
+
+ mutex_unlock(&rnp->kthread_mutex);
+
+ free_cpumask_var(cm);
}
/*
@@ -4608,8 +4760,9 @@ void rcutree_migrate_callbacks(int cpu)
__call_rcu_nocb_wake(my_rdp, true, flags);
} else {
rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */
- raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags);
+ raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
}
+ local_irq_restore(flags);
if (needwake)
rcu_gp_kthread_wake();
lockdep_assert_irqs_enabled();
@@ -4698,51 +4851,6 @@ static int rcu_pm_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-#ifdef CONFIG_RCU_EXP_KTHREAD
-struct kthread_worker *rcu_exp_gp_kworker;
-struct kthread_worker *rcu_exp_par_gp_kworker;
-
-static void __init rcu_start_exp_gp_kworkers(void)
-{
- const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
- const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
- struct sched_param param = { .sched_priority = kthread_prio };
-
- rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
- if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
- pr_err("Failed to create %s!\n", gp_kworker_name);
- return;
- }
-
- rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
- if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
- pr_err("Failed to create %s!\n", par_gp_kworker_name);
- kthread_destroy_worker(rcu_exp_gp_kworker);
- return;
- }
-
- sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
- sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
- &param);
-}
-
-static inline void rcu_alloc_par_gp_wq(void)
-{
-}
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-struct workqueue_struct *rcu_par_gp_wq;
-
-static void __init rcu_start_exp_gp_kworkers(void)
-{
-}
-
-static inline void rcu_alloc_par_gp_wq(void)
-{
- rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
- WARN_ON(!rcu_par_gp_wq);
-}
-#endif /* CONFIG_RCU_EXP_KTHREAD */
-
/*
* Spawn the kthreads that handle RCU's grace periods.
*/
@@ -4777,10 +4885,10 @@ static int __init rcu_spawn_gp_kthread(void)
* due to rcu_scheduler_fully_active.
*/
rcu_spawn_cpu_nocb_kthread(smp_processor_id());
- rcu_spawn_one_boost_kthread(rdp->mynode);
+ rcu_spawn_rnp_kthreads(rdp->mynode);
rcu_spawn_core_kthreads();
/* Create kthread worker for expedited GPs */
- rcu_start_exp_gp_kworkers();
+ rcu_start_exp_gp_kworker();
return 0;
}
early_initcall(rcu_spawn_gp_kthread);
@@ -4883,7 +4991,7 @@ static void __init rcu_init_one(void)
init_waitqueue_head(&rnp->exp_wq[2]);
init_waitqueue_head(&rnp->exp_wq[3]);
spin_lock_init(&rnp->exp_lock);
- mutex_init(&rnp->boost_kthread_mutex);
+ mutex_init(&rnp->kthread_mutex);
raw_spin_lock_init(&rnp->exp_poll_lock);
rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
@@ -5120,7 +5228,6 @@ void __init rcu_init(void)
/* Create workqueue for Tree SRCU and for expedited GPs. */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
- rcu_alloc_par_gp_wq();
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
@@ -5133,6 +5240,8 @@ void __init rcu_init(void)
(void)start_poll_synchronize_rcu_expedited();
rcu_test_sync_prims();
+
+ tasks_cblist_init_generic();
}
#include "tree_stall.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e9821a8422db..df48160b3136 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -21,14 +21,10 @@
#include "rcu_segcblist.h"
-/* Communicate arguments to a workqueue handler. */
+/* Communicate arguments to a kthread worker handler. */
struct rcu_exp_work {
unsigned long rew_s;
-#ifdef CONFIG_RCU_EXP_KTHREAD
struct kthread_work rew_work;
-#else
- struct work_struct rew_work;
-#endif /* CONFIG_RCU_EXP_KTHREAD */
};
/* RCU's kthread states for tracing. */
@@ -72,6 +68,9 @@ struct rcu_node {
/* Online CPUs for next expedited GP. */
/* Any CPU that has ever been online will */
/* have its bit set. */
+ struct kthread_worker *exp_kworker;
+ /* Workers performing per node expedited GP */
+ /* initialization. */
unsigned long cbovldmask;
/* CPUs experiencing callback overload. */
unsigned long ffmask; /* Fully functional CPUs. */
@@ -113,7 +112,7 @@ struct rcu_node {
/* side effect, not as a lock. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
- struct mutex boost_kthread_mutex;
+ struct mutex kthread_mutex;
/* Exclusion for thread spawning and affinity */
/* manipulation. */
struct task_struct *boost_kthread_task;
@@ -467,11 +466,10 @@ static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
unsigned long j, bool lazy);
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags,
- bool lazy);
-static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
- unsigned long flags);
+static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
+ rcu_callback_t func, unsigned long flags, bool lazy);
+static void __maybe_unused __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 6d7cea5d591f..6b83537480b1 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -173,7 +173,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
return ret;
}
-
/*
* Report the exit from RCU read-side critical section for the last task
* that queued itself during or before the current expedited preemptible-RCU
@@ -199,10 +198,9 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
}
if (rnp->parent == NULL) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- if (wake) {
- smp_mb(); /* EGP done before wake_up(). */
- swake_up_one(&rcu_state.expedited_wq);
- }
+ if (wake)
+ swake_up_one_online(&rcu_state.expedited_wq);
+
break;
}
mask = rnp->grpmask;
@@ -420,7 +418,6 @@ retry_ipi:
static void rcu_exp_sel_wait_wake(unsigned long s);
-#ifdef CONFIG_RCU_EXP_KTHREAD
static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
{
struct rcu_exp_work *rewp =
@@ -429,9 +426,14 @@ static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
__sync_rcu_exp_select_node_cpus(rewp);
}
-static inline bool rcu_gp_par_worker_started(void)
+static inline bool rcu_exp_worker_started(void)
+{
+ return !!READ_ONCE(rcu_exp_gp_kworker);
+}
+
+static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp)
{
- return !!READ_ONCE(rcu_exp_par_gp_kworker);
+ return !!READ_ONCE(rnp->exp_kworker);
}
static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
@@ -442,7 +444,7 @@ static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
* another work item on the same kthread worker can result in
* deadlock.
*/
- kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work);
+ kthread_queue_work(READ_ONCE(rnp->exp_kworker), &rnp->rew.rew_work);
}
static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
@@ -467,64 +469,6 @@ static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew
kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work);
}
-static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
-{
-}
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
-{
- struct rcu_exp_work *rewp =
- container_of(wp, struct rcu_exp_work, rew_work);
-
- __sync_rcu_exp_select_node_cpus(rewp);
-}
-
-static inline bool rcu_gp_par_worker_started(void)
-{
- return !!READ_ONCE(rcu_par_gp_wq);
-}
-
-static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
-{
- int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
-
- INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
- /* If all offline, queue the work on an unbound CPU. */
- if (unlikely(cpu > rnp->grphi - rnp->grplo))
- cpu = WORK_CPU_UNBOUND;
- else
- cpu += rnp->grplo;
- queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
-}
-
-static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
-{
- flush_work(&rnp->rew.rew_work);
-}
-
-/*
- * Work-queue handler to drive an expedited grace period forward.
- */
-static void wait_rcu_exp_gp(struct work_struct *wp)
-{
- struct rcu_exp_work *rewp;
-
- rewp = container_of(wp, struct rcu_exp_work, rew_work);
- rcu_exp_sel_wait_wake(rewp->rew_s);
-}
-
-static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
-{
- INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp);
- queue_work(rcu_gp_wq, &rew->rew_work);
-}
-
-static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
-{
- destroy_work_on_stack(&rew->rew_work);
-}
-#endif /* CONFIG_RCU_EXP_KTHREAD */
-
/*
* Select the nodes that the upcoming expedited grace period needs
* to wait for.
@@ -542,7 +486,7 @@ static void sync_rcu_exp_select_cpus(void)
rnp->exp_need_flush = false;
if (!READ_ONCE(rnp->expmask))
continue; /* Avoid early boot non-existent wq. */
- if (!rcu_gp_par_worker_started() ||
+ if (!rcu_exp_par_worker_started(rnp) ||
rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
rcu_is_last_leaf_node(rnp)) {
/* No worker started yet or last leaf, do direct call. */
@@ -957,7 +901,6 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
*/
void synchronize_rcu_expedited(void)
{
- bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT);
unsigned long flags;
struct rcu_exp_work rew;
struct rcu_node *rnp;
@@ -997,7 +940,7 @@ void synchronize_rcu_expedited(void)
return; /* Someone else did our work for us. */
/* Ensure that load happens before action based on it. */
- if (unlikely(boottime)) {
+ if (unlikely((rcu_scheduler_active == RCU_SCHEDULER_INIT) || !rcu_exp_worker_started())) {
/* Direct call during scheduler init and early_initcalls(). */
rcu_exp_sel_wait_wake(s);
} else {
@@ -1014,9 +957,6 @@ void synchronize_rcu_expedited(void)
/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
-
- if (likely(!boottime))
- synchronize_rcu_expedited_destroy_work(&rew);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 4efbf7333d4e..3f85577bddd4 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -256,6 +256,7 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
return __wake_nocb_gp(rdp_gp, rdp, force, flags);
}
+#ifdef CONFIG_RCU_LAZY
/*
* LAZY_FLUSH_JIFFIES decides the maximum amount of time that
* can elapse before lazy callbacks are flushed. Lazy callbacks
@@ -264,21 +265,20 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
* left unsubmitted to RCU after those many jiffies.
*/
#define LAZY_FLUSH_JIFFIES (10 * HZ)
-static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
+static unsigned long jiffies_lazy_flush = LAZY_FLUSH_JIFFIES;
-#ifdef CONFIG_RCU_LAZY
// To be called only from test code.
-void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
+void rcu_set_jiffies_lazy_flush(unsigned long jif)
{
- jiffies_till_flush = jif;
+ jiffies_lazy_flush = jif;
}
-EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
+EXPORT_SYMBOL(rcu_set_jiffies_lazy_flush);
-unsigned long rcu_lazy_get_jiffies_till_flush(void)
+unsigned long rcu_get_jiffies_lazy_flush(void)
{
- return jiffies_till_flush;
+ return jiffies_lazy_flush;
}
-EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
+EXPORT_SYMBOL(rcu_get_jiffies_lazy_flush);
#endif
/*
@@ -299,7 +299,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
*/
if (waketype == RCU_NOCB_WAKE_LAZY &&
rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
- mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
+ mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush());
WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
} else if (waketype == RCU_NOCB_WAKE_BYPASS) {
mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
@@ -482,7 +482,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
// flush ->nocb_bypass to ->cblist.
if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
(ncbs && bypass_is_lazy &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) ||
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()))) ||
ncbs >= qhimark) {
rcu_nocb_lock(rdp);
*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
@@ -532,9 +532,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
// 2. Both of these conditions are met:
// a. The bypass list previously had only lazy CBs, and:
// b. The new CB is non-lazy.
- if (ncbs && (!bypass_is_lazy || lazy)) {
- local_irq_restore(flags);
- } else {
+ if (!ncbs || (bypass_is_lazy && !lazy)) {
// No-CBs GP kthread might be indefinitely asleep, if so, wake.
rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
@@ -544,7 +542,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
} else {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("FirstBQnoWake"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
}
}
return true; // Callback already enqueued.
@@ -566,11 +564,12 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
long lazy_len;
long len;
struct task_struct *t;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
// If we are being polled or there is no kthread, just leave.
t = READ_ONCE(rdp->nocb_gp_kthread);
if (rcu_nocb_poll || !t) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeNotPoll"));
return;
@@ -583,17 +582,17 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
rdp->qlen_last_fqs_check = len;
// Only lazy CBs in bypass list
if (lazy_len && bypass_len == lazy_len) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
TPS("WakeLazy"));
} else if (!irqs_disabled_flags(flags)) {
/* ... if queue was empty ... */
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
wake_nocb_gp(rdp, false);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
TPS("WakeEmptyIsDeferred"));
}
@@ -610,20 +609,32 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
smp_mb(); /* Enqueue before timer_pending(). */
if ((rdp->nocb_cb_sleep ||
!rcu_segcblist_ready_cbs(&rdp->cblist)) &&
- !timer_pending(&rdp->nocb_timer)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ !timer_pending(&rdp_gp->nocb_timer)) {
+ rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
TPS("WakeOvfIsDeferred"));
} else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
}
} else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
}
}
+static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
+ rcu_callback_t func, unsigned long flags, bool lazy)
+{
+ bool was_alldone;
+
+ if (!rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) {
+ /* Not enqueued on bypass but locked, do regular enqueue */
+ rcutree_enqueue(rdp, head, func);
+ __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
+ }
+}
+
static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
bool *wake_state)
{
@@ -723,7 +734,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
lazy_ncbs = READ_ONCE(rdp->lazy_len);
if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) ||
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()) ||
bypass_ncbs > 2 * qhimark)) {
flush_bypass = true;
} else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
@@ -779,7 +790,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
needwake = rdp->nocb_cb_sleep;
WRITE_ONCE(rdp->nocb_cb_sleep, false);
- smp_mb(); /* CB invocation -after- GP end. */
} else {
needwake = false;
}
@@ -933,8 +943,7 @@ static void nocb_cb_wait(struct rcu_data *rdp)
swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
nocb_cb_wait_cond(rdp));
- // VVV Ensure CB invocation follows _sleep test.
- if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+ if (READ_ONCE(rdp->nocb_cb_sleep)) {
WARN_ON(signal_pending(current));
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
}
@@ -1383,7 +1392,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
rcu_nocb_unlock_irqrestore(rdp, flags);
continue;
}
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
+ rcu_nocb_try_flush_bypass(rdp, jiffies);
rcu_nocb_unlock_irqrestore(rdp, flags);
wake_nocb_gp(rdp, false);
sc->nr_to_scan -= _count;
@@ -1768,10 +1777,10 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
return true;
}
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags, bool lazy)
+static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
+ rcu_callback_t func, unsigned long flags, bool lazy)
{
- return false;
+ WARN_ON_ONCE(1); /* Should be dead code! */
}
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 41021080ad25..36a8b5dbf5b5 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1195,14 +1195,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
struct sched_param sp;
struct task_struct *t;
- mutex_lock(&rnp->boost_kthread_mutex);
- if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
- goto out;
+ if (rnp->boost_kthread_task)
+ return;
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
if (WARN_ON_ONCE(IS_ERR(t)))
- goto out;
+ return;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
@@ -1210,48 +1209,11 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
-
- out:
- mutex_unlock(&rnp->boost_kthread_mutex);
}
-/*
- * Set the per-rcu_node kthread's affinity to cover all CPUs that are
- * served by the rcu_node in question. The CPU hotplug lock is still
- * held, so the value of rnp->qsmaskinit will be stable.
- *
- * We don't include outgoingcpu in the affinity set, use -1 if there is
- * no outgoing CPU. If there are no CPUs left in the affinity set,
- * this function allows the kthread to execute on any CPU.
- *
- * Any future concurrent calls are serialized via ->boost_kthread_mutex.
- */
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static struct task_struct *rcu_boost_task(struct rcu_node *rnp)
{
- struct task_struct *t = rnp->boost_kthread_task;
- unsigned long mask;
- cpumask_var_t cm;
- int cpu;
-
- if (!t)
- return;
- if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
- return;
- mutex_lock(&rnp->boost_kthread_mutex);
- mask = rcu_rnp_online_cpus(rnp);
- for_each_leaf_node_possible_cpu(rnp, cpu)
- if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
- cpu != outgoingcpu)
- cpumask_set_cpu(cpu, cm);
- cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
- if (cpumask_empty(cm)) {
- cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
- if (outgoingcpu >= 0)
- cpumask_clear_cpu(outgoingcpu, cm);
- }
- set_cpus_allowed_ptr(t, cm);
- mutex_unlock(&rnp->boost_kthread_mutex);
- free_cpumask_var(cm);
+ return READ_ONCE(rnp->boost_kthread_task);
}
#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1270,10 +1232,10 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
}
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static struct task_struct *rcu_boost_task(struct rcu_node *rnp)
{
+ return NULL;
}
-
#endif /* #else #ifdef CONFIG_RCU_BOOST */
/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9116bcc90346..540f229700b6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3955,6 +3955,17 @@ void wake_up_if_idle(int cpu)
}
}
+bool cpus_equal_capacity(int this_cpu, int that_cpu)
+{
+ if (!sched_asym_cpucap_active())
+ return true;
+
+ if (this_cpu == that_cpu)
+ return true;
+
+ return arch_scale_cpu_capacity(this_cpu) == arch_scale_cpu_capacity(that_cpu);
+}
+
bool cpus_share_cache(int this_cpu, int that_cpu)
{
if (this_cpu == that_cpu)
@@ -6787,10 +6798,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
static void sched_update_worker(struct task_struct *tsk)
{
- if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) {
+ if (tsk->flags & PF_BLOCK_TS)
+ blk_plug_invalidate_ts(tsk);
if (tsk->flags & PF_WQ_WORKER)
wq_worker_running(tsk);
- else
+ else if (tsk->flags & PF_IO_WORKER)
io_wq_worker_running(tsk);
}
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 95c3c097083e..eece6244f9d2 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -133,7 +133,11 @@ unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy)
if (arch_scale_freq_invariant())
return policy->cpuinfo.max_freq;
- return policy->cur;
+ /*
+ * Apply a 25% margin so that we select a higher frequency than
+ * the current one before the CPU is fully busy:
+ */
+ return policy->cur + (policy->cur >> 2);
}
/**
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 2ad881d07752..4e715b9b278e 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -162,6 +162,9 @@
| MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \
| MEMBARRIER_CMD_GET_REGISTRATIONS)
+static DEFINE_MUTEX(membarrier_ipi_mutex);
+#define SERIALIZE_IPI() guard(mutex)(&membarrier_ipi_mutex)
+
static void ipi_mb(void *info)
{
smp_mb(); /* IPIs should be serializing but paranoid. */
@@ -259,6 +262,7 @@ static int membarrier_global_expedited(void)
if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
return -ENOMEM;
+ SERIALIZE_IPI();
cpus_read_lock();
rcu_read_lock();
for_each_online_cpu(cpu) {
@@ -347,6 +351,7 @@ static int membarrier_private_expedited(int flags, int cpu_id)
if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
return -ENOMEM;
+ SERIALIZE_IPI();
cpus_read_lock();
if (cpu_id >= 0) {
@@ -460,6 +465,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm)
* between threads which are users of @mm has its membarrier state
* updated.
*/
+ SERIALIZE_IPI();
cpus_read_lock();
rcu_read_lock();
for_each_online_cpu(cpu) {
diff --git a/kernel/signal.c b/kernel/signal.c
index c9c57d053ce4..bdca529f0f7b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -47,6 +47,7 @@
#include <linux/cgroup.h>
#include <linux/audit.h>
#include <linux/sysctl.h>
+#include <uapi/linux/pidfd.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -1436,7 +1437,8 @@ void lockdep_assert_task_sighand_held(struct task_struct *task)
#endif
/*
- * send signal info to all the members of a group
+ * send signal info to all the members of a thread group or to the
+ * individual thread if type == PIDTYPE_PID.
*/
int group_send_sig_info(int sig, struct kernel_siginfo *info,
struct task_struct *p, enum pid_type type)
@@ -1478,7 +1480,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
return ret;
}
-int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
+static int kill_pid_info_type(int sig, struct kernel_siginfo *info,
+ struct pid *pid, enum pid_type type)
{
int error = -ESRCH;
struct task_struct *p;
@@ -1487,11 +1490,10 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
rcu_read_lock();
p = pid_task(pid, PIDTYPE_PID);
if (p)
- error = group_send_sig_info(sig, info, p, PIDTYPE_TGID);
+ error = group_send_sig_info(sig, info, p, type);
rcu_read_unlock();
if (likely(!p || error != -ESRCH))
return error;
-
/*
* The task was unhashed in between, try again. If it
* is dead, pid_task() will return NULL, if we race with
@@ -1500,6 +1502,11 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
}
}
+int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
+{
+ return kill_pid_info_type(sig, info, pid, PIDTYPE_TGID);
+}
+
static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
{
int error;
@@ -1898,16 +1905,19 @@ int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
return send_sig_info(info.si_signo, &info, t);
}
-int kill_pgrp(struct pid *pid, int sig, int priv)
+static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
{
int ret;
-
read_lock(&tasklist_lock);
- ret = __kill_pgrp_info(sig, __si_special(priv), pid);
+ ret = __kill_pgrp_info(sig, info, pgrp);
read_unlock(&tasklist_lock);
-
return ret;
}
+
+int kill_pgrp(struct pid *pid, int sig, int priv)
+{
+ return kill_pgrp_info(sig, __si_special(priv), pid);
+}
EXPORT_SYMBOL(kill_pgrp);
int kill_pid(struct pid *pid, int sig, int priv)
@@ -2019,13 +2029,14 @@ ret:
return ret;
}
-static void do_notify_pidfd(struct task_struct *task)
+void do_notify_pidfd(struct task_struct *task)
{
- struct pid *pid;
+ struct pid *pid = task_pid(task);
WARN_ON(task->exit_state == 0);
- pid = task_pid(task);
- wake_up_all(&pid->wait_pidfd);
+
+ __wake_up(&pid->wait_pidfd, TASK_NORMAL, 0,
+ poll_to_key(EPOLLIN | EPOLLRDNORM));
}
/*
@@ -2050,9 +2061,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
WARN_ON_ONCE(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
-
- /* Wake up all pidfd waiters */
- do_notify_pidfd(tsk);
+ /*
+ * tsk is a group leader and has no threads, wake up the
+ * non-PIDFD_THREAD waiters.
+ */
+ if (thread_group_empty(tsk))
+ do_notify_pidfd(tsk);
if (sig != SIGCHLD) {
/*
@@ -3789,12 +3803,13 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese,
#endif
#endif
-static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
+static void prepare_kill_siginfo(int sig, struct kernel_siginfo *info,
+ enum pid_type type)
{
clear_siginfo(info);
info->si_signo = sig;
info->si_errno = 0;
- info->si_code = SI_USER;
+ info->si_code = (type == PIDTYPE_PID) ? SI_TKILL : SI_USER;
info->si_pid = task_tgid_vnr(current);
info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
}
@@ -3808,7 +3823,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
struct kernel_siginfo info;
- prepare_kill_siginfo(sig, &info);
+ prepare_kill_siginfo(sig, &info, PIDTYPE_TGID);
return kill_something_info(sig, &info, pid);
}
@@ -3861,6 +3876,10 @@ static struct pid *pidfd_to_pid(const struct file *file)
return tgid_pidfd_to_pid(file);
}
+#define PIDFD_SEND_SIGNAL_FLAGS \
+ (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \
+ PIDFD_SIGNAL_PROCESS_GROUP)
+
/**
* sys_pidfd_send_signal - Signal a process through a pidfd
* @pidfd: file descriptor of the process
@@ -3868,14 +3887,10 @@ static struct pid *pidfd_to_pid(const struct file *file)
* @info: signal info
* @flags: future flags
*
- * The syscall currently only signals via PIDTYPE_PID which covers
- * kill(<positive-pid>, <signal>. It does not signal threads or process
- * groups.
- * In order to extend the syscall to threads and process groups the @flags
- * argument should be used. In essence, the @flags argument will determine
- * what is signaled and not the file descriptor itself. Put in other words,
- * grouping is a property of the flags argument not a property of the file
- * descriptor.
+ * Send the signal to the thread group or to the individual thread depending
+ * on PIDFD_THREAD.
+ * A PIDFD_SIGNAL_* scope flag in @flags may be used to override the default
+ * scope of @pidfd.
*
* Return: 0 on success, negative errno on failure
*/
@@ -3886,9 +3901,14 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
struct fd f;
struct pid *pid;
kernel_siginfo_t kinfo;
+ enum pid_type type;
/* Enforce flags be set to 0 until we add an extension. */
- if (flags)
+ if (flags & ~PIDFD_SEND_SIGNAL_FLAGS)
+ return -EINVAL;
+
+ /* Ensure that only a single signal scope determining flag is set. */
+ if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1)
return -EINVAL;
f = fdget(pidfd);
@@ -3906,6 +3926,25 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
if (!access_pidfd_pidns(pid))
goto err;
+ switch (flags) {
+ case 0:
+ /* Infer scope from the type of pidfd. */
+ if (f.file->f_flags & PIDFD_THREAD)
+ type = PIDTYPE_PID;
+ else
+ type = PIDTYPE_TGID;
+ break;
+ case PIDFD_SIGNAL_THREAD:
+ type = PIDTYPE_PID;
+ break;
+ case PIDFD_SIGNAL_THREAD_GROUP:
+ type = PIDTYPE_TGID;
+ break;
+ case PIDFD_SIGNAL_PROCESS_GROUP:
+ type = PIDTYPE_PGID;
+ break;
+ }
+
if (info) {
ret = copy_siginfo_from_user_any(&kinfo, info);
if (unlikely(ret))
@@ -3917,15 +3956,17 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
/* Only allow sending arbitrary signals to yourself. */
ret = -EPERM;
- if ((task_pid(current) != pid) &&
+ if ((task_pid(current) != pid || type > PIDTYPE_TGID) &&
(kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
goto err;
} else {
- prepare_kill_siginfo(sig, &kinfo);
+ prepare_kill_siginfo(sig, &kinfo, type);
}
- ret = kill_pid_info(sig, &kinfo, pid);
-
+ if (type == PIDTYPE_PGID)
+ ret = kill_pgrp_info(sig, &kinfo, pid);
+ else
+ ret = kill_pid_info_type(sig, &kinfo, pid, type);
err:
fdput(f);
return ret;
@@ -3965,12 +4006,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
struct kernel_siginfo info;
- clear_siginfo(&info);
- info.si_signo = sig;
- info.si_errno = 0;
- info.si_code = SI_TKILL;
- info.si_pid = task_tgid_vnr(current);
- info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
+ prepare_kill_siginfo(sig, &info, PIDTYPE_PID);
return do_send_specific(tgid, pid, sig, &info);
}
diff --git a/kernel/sys.c b/kernel/sys.c
index e219fcfa112d..f8e543f1e38a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1785,21 +1785,24 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
struct task_struct *t;
unsigned long flags;
u64 tgutime, tgstime, utime, stime;
- unsigned long maxrss = 0;
+ unsigned long maxrss;
+ struct mm_struct *mm;
struct signal_struct *sig = p->signal;
+ unsigned int seq = 0;
- memset((char *)r, 0, sizeof (*r));
+retry:
+ memset(r, 0, sizeof(*r));
utime = stime = 0;
+ maxrss = 0;
if (who == RUSAGE_THREAD) {
task_cputime_adjusted(current, &utime, &stime);
accumulate_thread_rusage(p, r);
maxrss = sig->maxrss;
- goto out;
+ goto out_thread;
}
- if (!lock_task_sighand(p, &flags))
- return;
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
switch (who) {
case RUSAGE_BOTH:
@@ -1819,9 +1822,6 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
fallthrough;
case RUSAGE_SELF:
- thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- utime += tgutime;
- stime += tgstime;
r->ru_nvcsw += sig->nvcsw;
r->ru_nivcsw += sig->nivcsw;
r->ru_minflt += sig->min_flt;
@@ -1830,28 +1830,42 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
r->ru_oublock += sig->oublock;
if (maxrss < sig->maxrss)
maxrss = sig->maxrss;
+
+ rcu_read_lock();
__for_each_thread(sig, t)
accumulate_thread_rusage(t, r);
+ rcu_read_unlock();
+
break;
default:
BUG();
}
- unlock_task_sighand(p, &flags);
-out:
- r->ru_utime = ns_to_kernel_old_timeval(utime);
- r->ru_stime = ns_to_kernel_old_timeval(stime);
+ if (need_seqretry(&sig->stats_lock, seq)) {
+ seq = 1;
+ goto retry;
+ }
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
- if (who != RUSAGE_CHILDREN) {
- struct mm_struct *mm = get_task_mm(p);
+ if (who == RUSAGE_CHILDREN)
+ goto out_children;
- if (mm) {
- setmax_mm_hiwater_rss(&maxrss, mm);
- mmput(mm);
- }
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
+ utime += tgutime;
+ stime += tgstime;
+
+out_thread:
+ mm = get_task_mm(p);
+ if (mm) {
+ setmax_mm_hiwater_rss(&maxrss, mm);
+ mmput(mm);
}
+
+out_children:
r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
+ r->ru_utime = ns_to_kernel_old_timeval(utime);
+ r->ru_stime = ns_to_kernel_old_timeval(stime);
}
SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c108ed8a9804..3052b1f1168e 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -99,6 +99,7 @@ static u64 suspend_start;
* Interval: 0.5sec.
*/
#define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ))
/*
* Threshold: 0.0312s, when doubled: 0.0625s.
@@ -134,6 +135,7 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
static DEFINE_SPINLOCK(watchdog_lock);
static int watchdog_running;
static atomic_t watchdog_reset_pending;
+static int64_t watchdog_max_interval;
static inline void clocksource_watchdog_lock(unsigned long *flags)
{
@@ -399,8 +401,8 @@ static inline void clocksource_reset_watchdog(void)
static void clocksource_watchdog(struct timer_list *unused)
{
u64 csnow, wdnow, cslast, wdlast, delta;
+ int64_t wd_nsec, cs_nsec, interval;
int next_cpu, reset_pending;
- int64_t wd_nsec, cs_nsec;
struct clocksource *cs;
enum wd_read_status read_ret;
unsigned long extra_wait = 0;
@@ -470,6 +472,27 @@ static void clocksource_watchdog(struct timer_list *unused)
if (atomic_read(&watchdog_reset_pending))
continue;
+ /*
+ * The processing of timer softirqs can get delayed (usually
+ * on account of ksoftirqd not getting to run in a timely
+ * manner), which causes the watchdog interval to stretch.
+ * Skew detection may fail for longer watchdog intervals
+ * on account of fixed margins being used.
+ * Some clocksources, e.g. acpi_pm, cannot tolerate
+ * watchdog intervals longer than a few seconds.
+ */
+ interval = max(cs_nsec, wd_nsec);
+ if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) {
+ if (system_state > SYSTEM_SCHEDULING &&
+ interval > 2 * watchdog_max_interval) {
+ watchdog_max_interval = interval;
+ pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n",
+ cs_nsec, wd_nsec);
+ }
+ watchdog_timer.expires = jiffies;
+ continue;
+ }
+
/* Check the deviation from the watchdog clocksource. */
md = cs->uncertainty_margin + watchdog->uncertainty_margin;
if (abs(cs_nsec - wd_nsec) > md) {
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 760793998cdd..edb0f821dcea 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1085,6 +1085,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
enum hrtimer_mode mode)
{
debug_activate(timer, mode);
+ WARN_ON_ONCE(!base->cpu_base->online);
base->cpu_base->active_bases |= 1 << base->index;
@@ -2183,6 +2184,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
cpu_base->softirq_next_timer = NULL;
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
+ cpu_base->online = 1;
hrtimer_cpu_base_init_expiry_lock(cpu_base);
return 0;
}
@@ -2250,6 +2252,7 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
raw_spin_unlock(&new_base->lock);
+ old_base->online = 0;
raw_spin_unlock(&old_base->lock);
return 0;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a17d26002831..01fb50c1b17e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1576,13 +1576,23 @@ void tick_setup_sched_timer(void)
void tick_cancel_sched_timer(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ ktime_t idle_sleeptime, iowait_sleeptime;
+ unsigned long idle_calls, idle_sleeps;
# ifdef CONFIG_HIGH_RES_TIMERS
if (ts->sched_timer.base)
hrtimer_cancel(&ts->sched_timer);
# endif
+ idle_sleeptime = ts->idle_sleeptime;
+ iowait_sleeptime = ts->iowait_sleeptime;
+ idle_calls = ts->idle_calls;
+ idle_sleeps = ts->idle_sleeps;
memset(ts, 0, sizeof(*ts));
+ ts->idle_sleeptime = idle_sleeptime;
+ ts->iowait_sleeptime = iowait_sleeptime;
+ ts->idle_calls = idle_calls;
+ ts->idle_sleeps = idle_sleeps;
}
#endif
diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c
index ca058c8af6ba..3e5d422dd15c 100644
--- a/kernel/time/time_test.c
+++ b/kernel/time/time_test.c
@@ -73,7 +73,7 @@ static void time64_to_tm_test_date_range(struct kunit *test)
days = div_s64(secs, 86400);
- #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %ld", \
+ #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %lld", \
year, month, mdday, yday, days
KUNIT_ASSERT_EQ_MSG(test, year - 1900, result.tm_year, FAIL_MSG);
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 6cd2a4e3afb8..9ff018245840 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -189,9 +189,6 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
{
int size;
- if (num <= 0)
- return -EINVAL;
-
if (!fp->exit_handler) {
fp->rethook = NULL;
return 0;
@@ -199,15 +196,16 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
/* Initialize rethook if needed */
if (fp->nr_maxactive)
- size = fp->nr_maxactive;
+ num = fp->nr_maxactive;
else
- size = num * num_possible_cpus() * 2;
- if (size <= 0)
+ num *= num_possible_cpus() * 2;
+ if (num <= 0)
return -EINVAL;
+ size = sizeof(struct fprobe_rethook_node) + fp->entry_data_size;
+
/* Initialize rethook */
- fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler,
- sizeof(struct fprobe_rethook_node), size);
+ fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, size, num);
if (IS_ERR(fp->rethook))
return PTR_ERR(fp->rethook);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b01ae7d36021..83ba342aef31 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5325,7 +5325,17 @@ static LIST_HEAD(ftrace_direct_funcs);
static int register_ftrace_function_nolock(struct ftrace_ops *ops);
+/*
+ * If there are multiple ftrace_ops, use SAVE_REGS by default, so that the
+ * direct call is jumped to from ftrace_regs_caller. Only if the architecture
+ * supports direct_call but not ftrace_regs_caller, use SAVE_ARGS so that the
+ * direct call is jumped to from ftrace_caller for multiple ftrace_ops.
+ */
+#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS
#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_ARGS)
+#else
+#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS)
+#endif
static int check_direct_multi(struct ftrace_ops *ops)
{
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9286f88fcd32..aa332ace108b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -27,6 +27,7 @@
#include <linux/cpu.h>
#include <linux/oom.h>
+#include <asm/local64.h>
#include <asm/local.h>
/*
@@ -317,6 +318,11 @@ struct buffer_data_page {
unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
};
+struct buffer_data_read_page {
+ unsigned order; /* order of the page */
+ struct buffer_data_page *data; /* actual data, stored in this page */
+};
+
/*
* Note, the buffer_page list must be first. The buffer pages
* are allocated in cache lines, which means that each buffer
@@ -331,6 +337,7 @@ struct buffer_page {
unsigned read; /* index for next read */
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
+ unsigned order; /* order of the page */
struct buffer_data_page *page; /* Actual data page */
};
@@ -361,7 +368,7 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
static void free_buffer_page(struct buffer_page *bpage)
{
- free_page((unsigned long)bpage->page);
+ free_pages((unsigned long)bpage->page, bpage->order);
kfree(bpage);
}
@@ -373,46 +380,10 @@ static inline bool test_time_stamp(u64 delta)
return !!(delta & TS_DELTA_TEST);
}
-#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
-
-/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
-#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
-
-int ring_buffer_print_page_header(struct trace_seq *s)
-{
- struct buffer_data_page field;
-
- trace_seq_printf(s, "\tfield: u64 timestamp;\t"
- "offset:0;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)sizeof(field.time_stamp),
- (unsigned int)is_signed_type(u64));
-
- trace_seq_printf(s, "\tfield: local_t commit;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- (unsigned int)sizeof(field.commit),
- (unsigned int)is_signed_type(long));
-
- trace_seq_printf(s, "\tfield: int overwrite;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- 1,
- (unsigned int)is_signed_type(long));
-
- trace_seq_printf(s, "\tfield: char data;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), data),
- (unsigned int)BUF_PAGE_SIZE,
- (unsigned int)is_signed_type(char));
-
- return !trace_seq_has_overflowed(s);
-}
-
struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
wait_queue_head_t full_waiters;
- long wait_index;
bool waiters_pending;
bool full_waiters_pending;
bool wakeup_full;
@@ -463,27 +434,9 @@ enum {
RB_CTX_MAX
};
-#if BITS_PER_LONG == 32
-#define RB_TIME_32
-#endif
-
-/* To test on 64 bit machines */
-//#define RB_TIME_32
-
-#ifdef RB_TIME_32
-
-struct rb_time_struct {
- local_t cnt;
- local_t top;
- local_t bottom;
- local_t msb;
-};
-#else
-#include <asm/local64.h>
struct rb_time_struct {
local64_t time;
};
-#endif
typedef struct rb_time_struct rb_time_t;
#define MAX_NEST 5
@@ -557,6 +510,10 @@ struct trace_buffer {
struct rb_irq_work irq_work;
bool time_stamp_abs;
+
+ unsigned int subbuf_size;
+ unsigned int subbuf_order;
+ unsigned int max_data_size;
};
struct ring_buffer_iter {
@@ -570,150 +527,48 @@ struct ring_buffer_iter {
u64 read_stamp;
u64 page_stamp;
struct ring_buffer_event *event;
+ size_t event_size;
int missed_events;
};
-#ifdef RB_TIME_32
-
-/*
- * On 32 bit machines, local64_t is very expensive. As the ring
- * buffer doesn't need all the features of a true 64 bit atomic,
- * on 32 bit, it uses these functions (64 still uses local64_t).
- *
- * For the ring buffer, 64 bit required operations for the time is
- * the following:
- *
- * - Reads may fail if it interrupted a modification of the time stamp.
- * It will succeed if it did not interrupt another write even if
- * the read itself is interrupted by a write.
- * It returns whether it was successful or not.
- *
- * - Writes always succeed and will overwrite other writes and writes
- * that were done by events interrupting the current write.
- *
- * - A write followed by a read of the same time stamp will always succeed,
- * but may not contain the same value.
- *
- * - A cmpxchg will fail if it interrupted another write or cmpxchg.
- * Other than that, it acts like a normal cmpxchg.
- *
- * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
- * (bottom being the least significant 30 bits of the 60 bit time stamp).
- *
- * The two most significant bits of each half holds a 2 bit counter (0-3).
- * Each update will increment this counter by one.
- * When reading the top and bottom, if the two counter bits match then the
- * top and bottom together make a valid 60 bit number.
- */
-#define RB_TIME_SHIFT 30
-#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
-#define RB_TIME_MSB_SHIFT 60
-
-static inline int rb_time_cnt(unsigned long val)
-{
- return (val >> RB_TIME_SHIFT) & 3;
-}
-
-static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
-{
- u64 val;
-
- val = top & RB_TIME_VAL_MASK;
- val <<= RB_TIME_SHIFT;
- val |= bottom & RB_TIME_VAL_MASK;
-
- return val;
-}
-
-static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
-{
- unsigned long top, bottom, msb;
- unsigned long c;
-
- /*
- * If the read is interrupted by a write, then the cnt will
- * be different. Loop until both top and bottom have been read
- * without interruption.
- */
- do {
- c = local_read(&t->cnt);
- top = local_read(&t->top);
- bottom = local_read(&t->bottom);
- msb = local_read(&t->msb);
- } while (c != local_read(&t->cnt));
-
- *cnt = rb_time_cnt(top);
-
- /* If top, msb or bottom counts don't match, this interrupted a write */
- if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom))
- return false;
-
- /* The shift to msb will lose its cnt bits */
- *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT);
- return true;
-}
-
-static bool rb_time_read(rb_time_t *t, u64 *ret)
-{
- unsigned long cnt;
-
- return __rb_time_read(t, ret, &cnt);
-}
-
-static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
-{
- return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
-}
-
-static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom,
- unsigned long *msb)
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s)
{
- *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
- *bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
- *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT);
-}
+ struct buffer_data_page field;
-static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
-{
- val = rb_time_val_cnt(val, cnt);
- local_set(t, val);
-}
+ trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+ "offset:0;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)sizeof(field.time_stamp),
+ (unsigned int)is_signed_type(u64));
-static void rb_time_set(rb_time_t *t, u64 val)
-{
- unsigned long cnt, top, bottom, msb;
+ trace_seq_printf(s, "\tfield: local_t commit;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ (unsigned int)sizeof(field.commit),
+ (unsigned int)is_signed_type(long));
- rb_time_split(val, &top, &bottom, &msb);
+ trace_seq_printf(s, "\tfield: int overwrite;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ 1,
+ (unsigned int)is_signed_type(long));
- /* Writes always succeed with a valid number even if it gets interrupted. */
- do {
- cnt = local_inc_return(&t->cnt);
- rb_time_val_set(&t->top, top, cnt);
- rb_time_val_set(&t->bottom, bottom, cnt);
- rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt);
- } while (cnt != local_read(&t->cnt));
-}
+ trace_seq_printf(s, "\tfield: char data;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), data),
+ (unsigned int)buffer->subbuf_size,
+ (unsigned int)is_signed_type(char));
-static inline bool
-rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
-{
- return local_try_cmpxchg(l, &expect, set);
+ return !trace_seq_has_overflowed(s);
}
-#else /* 64 bits */
-
-/* local64_t always succeeds */
-
-static inline bool rb_time_read(rb_time_t *t, u64 *ret)
+static inline void rb_time_read(rb_time_t *t, u64 *ret)
{
*ret = local64_read(&t->time);
- return true;
}
static void rb_time_set(rb_time_t *t, u64 val)
{
local64_set(&t->time, val);
}
-#endif
/*
* Enable this to make sure that the event passed to
@@ -820,10 +675,7 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
WARN_ONCE(1, "nest (%d) greater than max", nest);
fail:
- /* Can only fail on 32 bit */
- if (!rb_time_read(&cpu_buffer->write_stamp, &ts))
- /* Screw it, just read the current time */
- ts = rb_time_stamp(cpu_buffer->buffer);
+ rb_time_read(&cpu_buffer->write_stamp, &ts);
return ts;
}
@@ -903,8 +755,19 @@ static void rb_wake_up_waiters(struct irq_work *work)
wake_up_all(&rbwork->waiters);
if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
+ /* Only cpu_buffer sets the above flags */
+ struct ring_buffer_per_cpu *cpu_buffer =
+ container_of(rbwork, struct ring_buffer_per_cpu, irq_work);
+
+ /* Called from interrupt context */
+ raw_spin_lock(&cpu_buffer->reader_lock);
rbwork->wakeup_full = false;
rbwork->full_waiters_pending = false;
+
+ /* Waking up all waiters, they will reset the shortest full */
+ cpu_buffer->shortest_full = 0;
+ raw_spin_unlock(&cpu_buffer->reader_lock);
+
wake_up_all(&rbwork->full_waiters);
}
}
@@ -945,14 +808,40 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
rbwork = &cpu_buffer->irq_work;
}
- rbwork->wait_index++;
- /* make sure the waiters see the new index */
- smp_wmb();
-
/* This can be called in any context */
irq_work_queue(&rbwork->work);
}
+static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ bool ret = false;
+
+ /* Reads of all CPUs always wait for any data */
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ return !ring_buffer_empty(buffer);
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (!ring_buffer_empty_cpu(buffer, cpu)) {
+ unsigned long flags;
+ bool pagebusy;
+
+ if (!full)
+ return true;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+ ret = !pagebusy && full_hit(buffer, cpu, full);
+
+ if (!cpu_buffer->shortest_full ||
+ cpu_buffer->shortest_full > full)
+ cpu_buffer->shortest_full = full;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ }
+ return ret;
+}
+
/**
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
@@ -968,7 +857,6 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
struct rb_irq_work *work;
- long wait_index;
int ret = 0;
/*
@@ -987,81 +875,54 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
work = &cpu_buffer->irq_work;
}
- wait_index = READ_ONCE(work->wait_index);
-
- while (true) {
- if (full)
- prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
- else
- prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
-
- /*
- * The events can happen in critical sections where
- * checking a work queue can cause deadlocks.
- * After adding a task to the queue, this flag is set
- * only to notify events to try to wake up the queue
- * using irq_work.
- *
- * We don't clear it even if the buffer is no longer
- * empty. The flag only causes the next event to run
- * irq_work to do the work queue wake up. The worse
- * that can happen if we race with !trace_empty() is that
- * an event will cause an irq_work to try to wake up
- * an empty queue.
- *
- * There's no reason to protect this flag either, as
- * the work queue and irq_work logic will do the necessary
- * synchronization for the wake ups. The only thing
- * that is necessary is that the wake up happens after
- * a task has been queued. It's OK for spurious wake ups.
- */
- if (full)
- work->full_waiters_pending = true;
- else
- work->waiters_pending = true;
-
- if (signal_pending(current)) {
- ret = -EINTR;
- break;
- }
-
- if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
- break;
-
- if (cpu != RING_BUFFER_ALL_CPUS &&
- !ring_buffer_empty_cpu(buffer, cpu)) {
- unsigned long flags;
- bool pagebusy;
- bool done;
-
- if (!full)
- break;
-
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
- done = !pagebusy && full_hit(buffer, cpu, full);
+ if (full)
+ prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
+ else
+ prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
- if (!cpu_buffer->shortest_full ||
- cpu_buffer->shortest_full > full)
- cpu_buffer->shortest_full = full;
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- if (done)
- break;
- }
+ /*
+ * The events can happen in critical sections where
+ * checking a work queue can cause deadlocks.
+ * After adding a task to the queue, this flag is set
+ * only to notify events to try to wake up the queue
+ * using irq_work.
+ *
+ * We don't clear it even if the buffer is no longer
+ * empty. The flag only causes the next event to run
+ * irq_work to do the work queue wake up. The worst
+ * that can happen if we race with !trace_empty() is that
+ * an event will cause an irq_work to try to wake up
+ * an empty queue.
+ *
+ * There's no reason to protect this flag either, as
+ * the work queue and irq_work logic will do the necessary
+ * synchronization for the wake ups. The only thing
+ * that is necessary is that the wake up happens after
+ * a task has been queued. It's OK for spurious wake ups.
+ */
+ if (full)
+ work->full_waiters_pending = true;
+ else
+ work->waiters_pending = true;
- schedule();
+ if (rb_watermark_hit(buffer, cpu, full))
+ goto out;
- /* Make sure to see the new wait index */
- smp_rmb();
- if (wait_index != work->wait_index)
- break;
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
}
+ schedule();
+ out:
if (full)
finish_wait(&work->full_waiters, &wait);
else
finish_wait(&work->waiters, &wait);
+ if (!ret && !rb_watermark_hit(buffer, cpu, full) && signal_pending(current))
+ ret = -EINTR;
+
return ret;
}
@@ -1084,28 +945,33 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
struct file *filp, poll_table *poll_table, int full)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct rb_irq_work *work;
+ struct rb_irq_work *rbwork;
if (cpu == RING_BUFFER_ALL_CPUS) {
- work = &buffer->irq_work;
+ rbwork = &buffer->irq_work;
full = 0;
} else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
- return -EINVAL;
+ return EPOLLERR;
cpu_buffer = buffer->buffers[cpu];
- work = &cpu_buffer->irq_work;
+ rbwork = &cpu_buffer->irq_work;
}
if (full) {
- poll_wait(filp, &work->full_waiters, poll_table);
- work->full_waiters_pending = true;
+ unsigned long flags;
+
+ poll_wait(filp, &rbwork->full_waiters, poll_table);
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ rbwork->full_waiters_pending = true;
if (!cpu_buffer->shortest_full ||
cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
} else {
- poll_wait(filp, &work->waiters, poll_table);
- work->waiters_pending = true;
+ poll_wait(filp, &rbwork->waiters, poll_table);
+ rbwork->waiters_pending = true;
}
/*
@@ -1619,10 +1485,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
list_add(&bpage->list, pages);
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0);
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags,
+ cpu_buffer->buffer->subbuf_order);
if (!page)
goto free_pages;
bpage->page = page_address(page);
+ bpage->order = cpu_buffer->buffer->subbuf_order;
rb_init_page(bpage->page);
if (user_thread && fatal_signal_pending(current))
@@ -1701,7 +1569,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
rb_check_bpage(cpu_buffer, bpage);
cpu_buffer->reader_page = bpage;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
+
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order);
if (!page)
goto fail_free_reader;
bpage->page = page_address(page);
@@ -1784,7 +1653,14 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ /* Default buffer page size - one system page */
+ buffer->subbuf_order = 0;
+ buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+
+ /* Max payload is buffer page size - header (8 bytes) */
+ buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
+
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
@@ -2103,7 +1979,7 @@ static void update_pages_handler(struct work_struct *work)
* @size: the new size.
* @cpu_id: the cpu buffer to resize
*
- * Minimum size is 2 * BUF_PAGE_SIZE.
+ * Minimum size is 2 * buffer->subbuf_size.
*
* Returns 0 on success and < 0 on failure.
*/
@@ -2125,7 +2001,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
!cpumask_test_cpu(cpu_id, buffer->cpumask))
return 0;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
/* we need a minimum of two pages */
if (nr_pages < 2)
@@ -2372,7 +2248,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
*/
barrier();
- if ((iter->head + length) > commit || length > BUF_PAGE_SIZE)
+ if ((iter->head + length) > commit || length > iter->event_size)
/* Writer corrupted the read? */
goto reset;
@@ -2412,11 +2288,13 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
}
static __always_inline unsigned
-rb_event_index(struct ring_buffer_event *event)
+rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event)
{
unsigned long addr = (unsigned long)event;
- return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
+ addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1;
+
+ return addr - BUF_PAGE_HDR_SIZE;
}
static void rb_inc_iter(struct ring_buffer_iter *iter)
@@ -2605,6 +2483,7 @@ static inline void
rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long tail, struct rb_event_info *info)
{
+ unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
struct buffer_page *tail_page = info->tail_page;
struct ring_buffer_event *event;
unsigned long length = info->length;
@@ -2613,13 +2492,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* Only the event that crossed the page boundary
* must fill the old tail_page with padding.
*/
- if (tail >= BUF_PAGE_SIZE) {
+ if (tail >= bsize) {
/*
* If the page was filled, then we still need
* to update the real_end. Reset it to zero
* and the reader will ignore it.
*/
- if (tail == BUF_PAGE_SIZE)
+ if (tail == bsize)
tail_page->real_end = 0;
local_sub(length, &tail_page->write);
@@ -2647,7 +2526,7 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* If we are less than the minimum size, we don't need to
* worry about it.
*/
- if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
+ if (tail > (bsize - RB_EVNT_MIN_SIZE)) {
/* No room for any events */
/* Mark the rest of the page with padding */
@@ -2662,19 +2541,19 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
}
/* Put in a discarded event */
- event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
+ event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE;
event->type_len = RINGBUF_TYPE_PADDING;
/* time delta must be non zero */
event->time_delta = 1;
/* account for padding bytes */
- local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
+ local_add(bsize - tail, &cpu_buffer->entries_bytes);
/* Make sure the padding is visible before the tail_page->write update */
smp_wmb();
/* Set write to end of buffer */
- length = (tail + length) - BUF_PAGE_SIZE;
+ length = (tail + length) - bsize;
local_sub(length, &tail_page->write);
}
@@ -2788,7 +2667,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* Slow path */
static struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
+rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event, u64 delta, bool abs)
{
if (abs)
event->type_len = RINGBUF_TYPE_TIME_STAMP;
@@ -2796,7 +2676,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
event->type_len = RINGBUF_TYPE_TIME_EXTEND;
/* Not the first event on the page, or not delta? */
- if (abs || rb_event_index(event)) {
+ if (abs || rb_event_index(cpu_buffer, event)) {
event->time_delta = delta & TS_MASK;
event->array[0] = delta >> TS_SHIFT;
} else {
@@ -2826,7 +2706,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
(unsigned long long)info->ts,
(unsigned long long)info->before,
(unsigned long long)info->after,
- (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
+ (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}),
sched_clock_stable() ? "" :
"If you just came from a suspend/resume,\n"
"please switch to the trace global clock:\n"
@@ -2870,7 +2750,7 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
if (!abs)
info->delta = 0;
}
- *event = rb_add_time_stamp(*event, info->delta, abs);
+ *event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs);
*length -= RB_LEN_TIME_EXTEND;
*delta = 0;
}
@@ -2954,10 +2834,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage;
unsigned long addr;
- new_index = rb_event_index(event);
+ new_index = rb_event_index(cpu_buffer, event);
old_index = new_index + rb_event_ts_length(event);
addr = (unsigned long)event;
- addr &= PAGE_MASK;
+ addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
bpage = READ_ONCE(cpu_buffer->tail_page);
@@ -3344,6 +3224,76 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
#define CHECK_FULL_PAGE 1L
#ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
+
+static const char *show_irq_str(int bits)
+{
+ const char *type[] = {
+ ".", // 0
+ "s", // 1
+ "h", // 2
+ "Hs", // 3
+ "n", // 4
+ "Ns", // 5
+ "Nh", // 6
+ "NHs", // 7
+ };
+
+ return type[bits];
+}
+
+/* Assume this is a trace event */
+static const char *show_flags(struct ring_buffer_event *event)
+{
+ struct trace_entry *entry;
+ int bits = 0;
+
+ if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry))
+ return "X";
+
+ entry = ring_buffer_event_data(event);
+
+ if (entry->flags & TRACE_FLAG_SOFTIRQ)
+ bits |= 1;
+
+ if (entry->flags & TRACE_FLAG_HARDIRQ)
+ bits |= 2;
+
+ if (entry->flags & TRACE_FLAG_NMI)
+ bits |= 4;
+
+ return show_irq_str(bits);
+}
+
+static const char *show_irq(struct ring_buffer_event *event)
+{
+ struct trace_entry *entry;
+
+ if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry))
+ return "";
+
+ entry = ring_buffer_event_data(event);
+ if (entry->flags & TRACE_FLAG_IRQS_OFF)
+ return "d";
+ return "";
+}
+
+static const char *show_interrupt_level(void)
+{
+ unsigned long pc = preempt_count();
+ unsigned char level = 0;
+
+ if (pc & SOFTIRQ_OFFSET)
+ level |= 1;
+
+ if (pc & HARDIRQ_MASK)
+ level |= 2;
+
+ if (pc & NMI_MASK)
+ level |= 4;
+
+ return show_irq_str(level);
+}
+
static void dump_buffer_page(struct buffer_data_page *bpage,
struct rb_event_info *info,
unsigned long tail)
@@ -3364,34 +3314,57 @@ static void dump_buffer_page(struct buffer_data_page *bpage,
case RINGBUF_TYPE_TIME_EXTEND:
delta = rb_event_time_stamp(event);
ts += delta;
- pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta);
+ pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n",
+ e, ts, delta);
break;
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
ts = rb_fix_abs_ts(delta, ts);
- pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta);
+ pr_warn(" 0x%x: [%lld] absolute:%lld TIME STAMP\n",
+ e, ts, delta);
break;
case RINGBUF_TYPE_PADDING:
ts += event->time_delta;
- pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta);
+ pr_warn(" 0x%x: [%lld] delta:%d PADDING\n",
+ e, ts, event->time_delta);
break;
case RINGBUF_TYPE_DATA:
ts += event->time_delta;
- pr_warn(" [%lld] delta:%d\n", ts, event->time_delta);
+ pr_warn(" 0x%x: [%lld] delta:%d %s%s\n",
+ e, ts, event->time_delta,
+ show_flags(event), show_irq(event));
break;
default:
break;
}
}
+ pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e);
}
static DEFINE_PER_CPU(atomic_t, checking);
static atomic_t ts_dump;
+#define buffer_warn_return(fmt, ...) \
+ do { \
+ /* If another report is happening, ignore this one */ \
+ if (atomic_inc_return(&ts_dump) != 1) { \
+ atomic_dec(&ts_dump); \
+ goto out; \
+ } \
+ atomic_inc(&cpu_buffer->record_disabled); \
+ pr_warn(fmt, ##__VA_ARGS__); \
+ dump_buffer_page(bpage, info, tail); \
+ atomic_dec(&ts_dump); \
+ /* There's some cases in boot up that this can happen */ \
+ if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING)) \
+ /* Do not re-enable checking */ \
+ return; \
+ } while (0)
+
/*
* Check if the current event time stamp matches the deltas on
* the buffer page.
@@ -3445,7 +3418,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
- ts = rb_fix_abs_ts(delta, ts);
+ delta = rb_fix_abs_ts(delta, ts);
+ if (delta < ts) {
+ buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
+ cpu_buffer->cpu, ts, delta);
+ }
+ ts = delta;
break;
case RINGBUF_TYPE_PADDING:
@@ -3462,23 +3440,11 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
}
if ((full && ts > info->ts) ||
(!full && ts + info->delta != info->ts)) {
- /* If another report is happening, ignore this one */
- if (atomic_inc_return(&ts_dump) != 1) {
- atomic_dec(&ts_dump);
- goto out;
- }
- atomic_inc(&cpu_buffer->record_disabled);
- /* There's some cases in boot up that this can happen */
- WARN_ON_ONCE(system_state != SYSTEM_BOOTING);
- pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n",
- cpu_buffer->cpu,
- ts + info->delta, info->ts, info->delta,
- info->before, info->after,
- full ? " (full)" : "");
- dump_buffer_page(bpage, info, tail);
- atomic_dec(&ts_dump);
- /* Do not re-enable checking */
- return;
+ buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",
+ cpu_buffer->cpu,
+ ts + info->delta, info->ts, info->delta,
+ info->before, info->after,
+ full ? " (full)" : "", show_interrupt_level());
}
out:
atomic_dec(this_cpu_ptr(&checking));
@@ -3498,16 +3464,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event;
struct buffer_page *tail_page;
unsigned long tail, write, w;
- bool a_ok;
- bool b_ok;
/* Don't let the compiler play games with cpu_buffer->tail_page */
tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
/*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK;
barrier();
- b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ rb_time_read(&cpu_buffer->before_stamp, &info->before);
+ rb_time_read(&cpu_buffer->write_stamp, &info->after);
barrier();
info->ts = rb_time_stamp(cpu_buffer->buffer);
@@ -3522,7 +3486,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
if (!w) {
/* Use the sub-buffer timestamp */
info->delta = 0;
- } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) {
+ } else if (unlikely(info->before != info->after)) {
info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
info->length += RB_LEN_TIME_EXTEND;
} else {
@@ -3544,7 +3508,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
tail = write - info->length;
/* See if we shot pass the end of this buffer page */
- if (unlikely(write > BUF_PAGE_SIZE)) {
+ if (unlikely(write > cpu_buffer->buffer->subbuf_size)) {
check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
return rb_move_tail(cpu_buffer, tail, info);
}
@@ -3571,8 +3535,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
/* SLOW PATH - Interrupted between A and C */
/* Save the old before_stamp */
- a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
- RB_WARN_ON(cpu_buffer, !a_ok);
+ rb_time_read(&cpu_buffer->before_stamp, &info->before);
/*
* Read a new timestamp and update the before_stamp to make
@@ -3584,9 +3547,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
rb_time_set(&cpu_buffer->before_stamp, ts);
barrier();
- /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
- /* Was interrupted before here, write_stamp must be valid */
- RB_WARN_ON(cpu_buffer, !a_ok);
+ /*E*/ rb_time_read(&cpu_buffer->write_stamp, &info->after);
barrier();
/*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
info->after == info->before && info->after < ts) {
@@ -3678,7 +3639,7 @@ rb_reserve_next_event(struct trace_buffer *buffer,
if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
add_ts_default = RB_ADD_STAMP_ABSOLUTE;
info.length += RB_LEN_TIME_EXTEND;
- if (info.length > BUF_MAX_DATA_SIZE)
+ if (info.length > cpu_buffer->buffer->max_data_size)
goto out_fail;
} else {
add_ts_default = RB_ADD_STAMP_NONE;
@@ -3753,7 +3714,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length)
if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
goto out;
- if (unlikely(length > BUF_MAX_DATA_SIZE))
+ if (unlikely(length > buffer->max_data_size))
goto out;
if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -3787,7 +3748,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage = cpu_buffer->commit_page;
struct buffer_page *start;
- addr &= PAGE_MASK;
+ addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
/* Do the likely case first */
if (likely(bpage->page == (void *)addr)) {
@@ -3903,7 +3864,7 @@ int ring_buffer_write(struct trace_buffer *buffer,
if (atomic_read(&cpu_buffer->record_disabled))
goto out;
- if (length > BUF_MAX_DATA_SIZE)
+ if (length > buffer->max_data_size)
goto out;
if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -4483,6 +4444,7 @@ static struct buffer_page *
rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
+ unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
unsigned long overwrite;
unsigned long flags;
int nr_loops = 0;
@@ -4618,7 +4580,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
#define USECS_WAIT 1000000
for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
/* If the write is past the end of page, a writer is still updating it */
- if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
+ if (likely(!reader || rb_page_write(reader) <= bsize))
break;
udelay(1);
@@ -5062,7 +5024,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
return NULL;
/* Holds the entire event: data and meta data */
- iter->event = kmalloc(BUF_PAGE_SIZE, flags);
+ iter->event_size = buffer->subbuf_size;
+ iter->event = kmalloc(iter->event_size, flags);
if (!iter->event) {
kfree(iter);
return NULL;
@@ -5178,19 +5141,28 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_advance);
*/
unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
{
- /*
- * Earlier, this method returned
- * BUF_PAGE_SIZE * buffer->nr_pages
- * Since the nr_pages field is now removed, we have converted this to
- * return the per cpu buffer value.
- */
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
- return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
+ return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages;
}
EXPORT_SYMBOL_GPL(ring_buffer_size);
+/**
+ * ring_buffer_max_event_size - return the max data size of an event
+ * @buffer: The ring buffer.
+ *
+ * Returns the maximum size an event can be.
+ */
+unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer)
+{
+ /* If abs timestamp is requested, events have a timestamp too */
+ if (ring_buffer_time_stamp_abs(buffer))
+ return buffer->max_data_size - RB_LEN_TIME_EXTEND;
+ return buffer->max_data_size;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_max_event_size);
+
static void rb_clear_buffer_page(struct buffer_page *page)
{
local_set(&page->write, 0);
@@ -5461,6 +5433,9 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;
+ if (buffer_a->subbuf_order != buffer_b->subbuf_order)
+ goto out;
+
ret = -EAGAIN;
if (atomic_read(&buffer_a->record_disabled))
@@ -5532,40 +5507,48 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
* Returns:
* The page allocated, or ERR_PTR
*/
-void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
+struct buffer_data_read_page *
+ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct buffer_data_page *bpage = NULL;
+ struct buffer_data_read_page *bpage = NULL;
unsigned long flags;
struct page *page;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return ERR_PTR(-ENODEV);
+ bpage = kzalloc(sizeof(*bpage), GFP_KERNEL);
+ if (!bpage)
+ return ERR_PTR(-ENOMEM);
+
+ bpage->order = buffer->subbuf_order;
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
if (cpu_buffer->free_page) {
- bpage = cpu_buffer->free_page;
+ bpage->data = cpu_buffer->free_page;
cpu_buffer->free_page = NULL;
}
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
- if (bpage)
+ if (bpage->data)
goto out;
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY, 0);
- if (!page)
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
+ cpu_buffer->buffer->subbuf_order);
+ if (!page) {
+ kfree(bpage);
return ERR_PTR(-ENOMEM);
+ }
- bpage = page_address(page);
+ bpage->data = page_address(page);
out:
- rb_init_page(bpage);
+ rb_init_page(bpage->data);
return bpage;
}
@@ -5575,14 +5558,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
* ring_buffer_free_read_page - free an allocated read page
* @buffer: the buffer the page was allocate for
* @cpu: the cpu buffer the page came from
- * @data: the page to free
+ * @data_page: the page to free
*
* Free a page allocated from ring_buffer_alloc_read_page.
*/
-void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
+ struct buffer_data_read_page *data_page)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct buffer_data_page *bpage = data;
+ struct buffer_data_page *bpage = data_page->data;
struct page *page = virt_to_page(bpage);
unsigned long flags;
@@ -5591,8 +5575,12 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
cpu_buffer = buffer->buffers[cpu];
- /* If the page is still in use someplace else, we can't reuse it */
- if (page_ref_count(page) > 1)
+ /*
+ * If the page is still in use someplace else, or order of the page
+ * is different from the subbuffer order of the buffer -
+ * we can't reuse it
+ */
+ if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order)
goto out;
local_irq_save(flags);
@@ -5607,7 +5595,8 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
local_irq_restore(flags);
out:
- free_page((unsigned long)bpage);
+ free_pages((unsigned long)bpage, data_page->order);
+ kfree(data_page);
}
EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
@@ -5628,9 +5617,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
* rpage = ring_buffer_alloc_read_page(buffer, cpu);
* if (IS_ERR(rpage))
* return PTR_ERR(rpage);
- * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
+ * ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0);
* if (ret >= 0)
- * process_page(rpage, ret);
+ * process_page(ring_buffer_read_page_data(rpage), ret);
+ * ring_buffer_free_read_page(buffer, cpu, rpage);
*
* When @full is set, the function will not return true unless
* the writer is off the reader page.
@@ -5645,7 +5635,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
* <0 if no data has been transferred.
*/
int ring_buffer_read_page(struct trace_buffer *buffer,
- void **data_page, size_t len, int cpu, int full)
+ struct buffer_data_read_page *data_page,
+ size_t len, int cpu, int full)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
@@ -5670,10 +5661,12 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
len -= BUF_PAGE_HDR_SIZE;
- if (!data_page)
+ if (!data_page || !data_page->data)
+ goto out;
+ if (data_page->order != buffer->subbuf_order)
goto out;
- bpage = *data_page;
+ bpage = data_page->data;
if (!bpage)
goto out;
@@ -5767,11 +5760,11 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/* swap the pages */
rb_init_page(bpage);
bpage = reader->page;
- reader->page = *data_page;
+ reader->page = data_page->data;
local_set(&reader->write, 0);
local_set(&reader->entries, 0);
reader->read = 0;
- *data_page = bpage;
+ data_page->data = bpage;
/*
* Use the real_end for the data size,
@@ -5793,7 +5786,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/* If there is room at the end of the page to save the
* missed events, then record it there.
*/
- if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
+ if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
memcpy(&bpage->data[commit], &missed_events,
sizeof(missed_events));
local_add(RB_MISSED_STORED, &bpage->commit);
@@ -5805,8 +5798,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/*
* This page may be off to user land. Zero it out here.
*/
- if (commit < BUF_PAGE_SIZE)
- memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
+ if (commit < buffer->subbuf_size)
+ memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
out_unlock:
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -5816,6 +5809,213 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
+/**
+ * ring_buffer_read_page_data - get pointer to the data in the page.
+ * @page: the page to get the data from
+ *
+ * Returns pointer to the actual data in this page.
+ */
+void *ring_buffer_read_page_data(struct buffer_data_read_page *page)
+{
+ return page->data;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_page_data);
+
+/**
+ * ring_buffer_subbuf_size_get - get size of the sub buffer.
+ * @buffer: the buffer to get the sub buffer size from
+ *
+ * Returns size of the sub buffer, in bytes.
+ */
+int ring_buffer_subbuf_size_get(struct trace_buffer *buffer)
+{
+ return buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get);
+
+/**
+ * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page.
+ * @buffer: The ring_buffer to get the system sub page order from
+ *
+ * By default, one ring buffer sub page equals one system page. This parameter
+ * is configurable, per ring buffer. The size of the ring buffer sub page can be
+ * extended, but must be an order of system page size.
+ *
+ * Returns the order of buffer sub page size, in system pages:
+ * 0 means the sub buffer size is 1 system page and so forth.
+ * In case of an error < 0 is returned.
+ */
+int ring_buffer_subbuf_order_get(struct trace_buffer *buffer)
+{
+ if (!buffer)
+ return -EINVAL;
+
+ return buffer->subbuf_order;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
+
+/**
+ * ring_buffer_subbuf_order_set - set the size of ring buffer sub page.
+ * @buffer: The ring_buffer to set the new page size.
+ * @order: Order of the system pages in one sub buffer page
+ *
+ * By default, one ring buffer page equals one system page. This API can be
+ * used to set new size of the ring buffer page. The size must be order of
+ * system page size, that's why the input parameter @order is the order of
+ * system pages that are allocated for one ring buffer page:
+ * 0 - 1 system page
+ * 1 - 2 system pages
+ * 2 - 4 system pages
+ * ...
+ *
+ * Returns 0 on success or < 0 in case of an error.
+ */
+int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *bpage, *tmp;
+ int old_order, old_size;
+ int nr_pages;
+ int psize;
+ int err;
+ int cpu;
+
+ if (!buffer || order < 0)
+ return -EINVAL;
+
+ if (buffer->subbuf_order == order)
+ return 0;
+
+ psize = (1 << order) * PAGE_SIZE;
+ if (psize <= BUF_PAGE_HDR_SIZE)
+ return -EINVAL;
+
+ /* Size of a subbuf cannot be greater than the write counter */
+ if (psize > RB_WRITE_MASK + 1)
+ return -EINVAL;
+
+ old_order = buffer->subbuf_order;
+ old_size = buffer->subbuf_size;
+
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
+ atomic_inc(&buffer->record_disabled);
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ buffer->subbuf_order = order;
+ buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE;
+
+ /* Make sure all new buffers are allocated, before deleting the old ones */
+ for_each_buffer_cpu(buffer, cpu) {
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ continue;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ /* Update the number of pages to match the new size */
+ nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
+ nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
+
+ /* we need a minimum of two pages */
+ if (nr_pages < 2)
+ nr_pages = 2;
+
+ cpu_buffer->nr_pages_to_update = nr_pages;
+
+ /* Include the reader page */
+ nr_pages++;
+
+ /* Allocate the new size buffer */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (__rb_allocate_pages(cpu_buffer, nr_pages,
+ &cpu_buffer->new_pages)) {
+ /* not enough memory for new pages */
+ err = -ENOMEM;
+ goto error;
+ }
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ continue;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ /* Clear the head bit to make the link list normal to read */
+ rb_head_page_deactivate(cpu_buffer);
+
+ /* Now walk the list and free all the old sub buffers */
+ list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ /* The above loop stopped at the last page needing to be freed */
+ bpage = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ free_buffer_page(bpage);
+
+ /* Free the current reader page */
+ free_buffer_page(cpu_buffer->reader_page);
+
+ /* One page was allocated for the reader page */
+ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next,
+ struct buffer_page, list);
+ list_del_init(&cpu_buffer->reader_page->list);
+
+ /* The cpu_buffer pages are a link list with no head */
+ cpu_buffer->pages = cpu_buffer->new_pages.next;
+ cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev;
+ cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next;
+
+ /* Clear the new_pages list */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update;
+ cpu_buffer->nr_pages_to_update = 0;
+
+ free_pages((unsigned long)cpu_buffer->free_page, old_order);
+ cpu_buffer->free_page = NULL;
+
+ rb_head_page_activate(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
+ }
+
+ atomic_dec(&buffer->record_disabled);
+ mutex_unlock(&buffer->mutex);
+
+ return 0;
+
+error:
+ buffer->subbuf_order = old_order;
+ buffer->subbuf_size = old_size;
+
+ atomic_dec(&buffer->record_disabled);
+ mutex_unlock(&buffer->mutex);
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (!cpu_buffer->nr_pages_to_update)
+ continue;
+
+ list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index aef34673d79d..008187ebd7fe 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -104,10 +104,11 @@ static enum event_status read_event(int cpu)
static enum event_status read_page(int cpu)
{
+ struct buffer_data_read_page *bpage;
struct ring_buffer_event *event;
struct rb_page *rpage;
unsigned long commit;
- void *bpage;
+ int page_size;
int *entry;
int ret;
int inc;
@@ -117,14 +118,15 @@ static enum event_status read_page(int cpu)
if (IS_ERR(bpage))
return EVENT_DROPPED;
- ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+ page_size = ring_buffer_subbuf_size_get(buffer);
+ ret = ring_buffer_read_page(buffer, bpage, page_size, cpu, 1);
if (ret >= 0) {
- rpage = bpage;
+ rpage = ring_buffer_read_page_data(bpage);
/* The commit may have missed event flags set, clear them */
commit = local_read(&rpage->commit) & 0xfffff;
for (i = 0; i < commit && !test_error ; i += inc) {
- if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+ if (i >= (page_size - offsetof(struct rb_page, data))) {
TEST_ERROR();
break;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a0defe156b57..c9c898307348 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -39,6 +39,7 @@
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/panic_notifier.h>
+#include <linux/kmemleak.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
@@ -1263,10 +1264,17 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val);
int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
+ int order;
int ret;
if (!tr->allocated_snapshot) {
+ /* Make the snapshot buffer have the same order as main buffer */
+ order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+ ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
+ if (ret < 0)
+ return ret;
+
/* allocate spare buffer */
ret = resize_buffer_duplicate_size(&tr->max_buffer,
&tr->array_buffer, RING_BUFFER_ALL_CPUS);
@@ -1286,6 +1294,7 @@ static void free_snapshot(struct trace_array *tr)
* The max_tr ring buffer has some state (e.g. ring->clock) and
* we want preserve it.
*/
+ ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0);
ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
set_buffer_entries(&tr->max_buffer, 1);
tracing_reset_online_cpus(&tr->max_buffer);
@@ -1524,7 +1533,7 @@ void disable_trace_on_warning(void)
bool tracer_tracing_is_on(struct trace_array *tr)
{
if (tr->array_buffer.buffer)
- return ring_buffer_record_is_on(tr->array_buffer.buffer);
+ return ring_buffer_record_is_set_on(tr->array_buffer.buffer);
return !tr->buffer_disabled;
}
@@ -2312,7 +2321,7 @@ struct saved_cmdlines_buffer {
unsigned *map_cmdline_to_pid;
unsigned cmdline_num;
int cmdline_idx;
- char *saved_cmdlines;
+ char saved_cmdlines[];
};
static struct saved_cmdlines_buffer *savedcmd;
@@ -2326,47 +2335,60 @@ static inline void set_cmdline(int idx, const char *cmdline)
strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
}
-static int allocate_cmdlines_buffer(unsigned int val,
- struct saved_cmdlines_buffer *s)
+/*
+ * Release a saved_cmdlines_buffer.
+ *
+ * The separately allocated map_cmdline_to_pid array goes back through
+ * kfree(). The structure itself (header plus trailing saved_cmdlines[]
+ * storage) is returned with free_pages(), using an order recomputed from
+ * the stored cmdline_num, after its kmemleak tracking is dropped.
+ */
+static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
+{
+	size_t bytes = sizeof(*s) + s->cmdline_num * TASK_COMM_LEN;
+	unsigned long addr = (unsigned long)s;
+
+	kfree(s->map_cmdline_to_pid);
+	kmemleak_free(s);
+	free_pages(addr, get_order(bytes));
+}
+
+static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
{
+ struct saved_cmdlines_buffer *s;
+ struct page *page;
+ int orig_size, size;
+ int order;
+
+ /* Figure out how much is needed to hold the given number of cmdlines */
+ orig_size = sizeof(*s) + val * TASK_COMM_LEN;
+ order = get_order(orig_size);
+ size = 1 << (order + PAGE_SHIFT);
+ page = alloc_pages(GFP_KERNEL, order);
+ if (!page)
+ return NULL;
+
+ s = page_address(page);
+ kmemleak_alloc(s, size, 1, GFP_KERNEL);
+ memset(s, 0, sizeof(*s));
+
+ /* Round up to actual allocation */
+ val = (size - sizeof(*s)) / TASK_COMM_LEN;
+ s->cmdline_num = val;
+
s->map_cmdline_to_pid = kmalloc_array(val,
sizeof(*s->map_cmdline_to_pid),
GFP_KERNEL);
- if (!s->map_cmdline_to_pid)
- return -ENOMEM;
-
- s->saved_cmdlines = kmalloc_array(TASK_COMM_LEN, val, GFP_KERNEL);
- if (!s->saved_cmdlines) {
- kfree(s->map_cmdline_to_pid);
- return -ENOMEM;
+ if (!s->map_cmdline_to_pid) {
+ free_saved_cmdlines_buffer(s);
+ return NULL;
}
s->cmdline_idx = 0;
- s->cmdline_num = val;
memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
sizeof(s->map_pid_to_cmdline));
memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
val * sizeof(*s->map_cmdline_to_pid));
- return 0;
+ return s;
}
static int trace_create_savedcmd(void)
{
- int ret;
-
- savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL);
- if (!savedcmd)
- return -ENOMEM;
-
- ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd);
- if (ret < 0) {
- kfree(savedcmd);
- savedcmd = NULL;
- return -ENOMEM;
- }
+ savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);
- return 0;
+ return savedcmd ? 0 : -ENOMEM;
}
int is_tracing_stopped(void)
@@ -3767,7 +3789,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str,
/* OK if part of the temp seq buffer */
if ((addr >= (unsigned long)iter->tmp_seq.buffer) &&
- (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE))
+ (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE))
return true;
/* Core rodata can not be freed */
@@ -5032,7 +5054,7 @@ static int tracing_release(struct inode *inode, struct file *file)
return 0;
}
-static int tracing_release_generic_tr(struct inode *inode, struct file *file)
+int tracing_release_generic_tr(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
@@ -6048,26 +6070,14 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
-static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
-{
- kfree(s->saved_cmdlines);
- kfree(s->map_cmdline_to_pid);
- kfree(s);
-}
-
static int tracing_resize_saved_cmdlines(unsigned int val)
{
struct saved_cmdlines_buffer *s, *savedcmd_temp;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = allocate_cmdlines_buffer(val);
if (!s)
return -ENOMEM;
- if (allocate_cmdlines_buffer(val, s) < 0) {
- kfree(s);
- return -ENOMEM;
- }
-
preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
savedcmd_temp = savedcmd;
@@ -6946,8 +6956,8 @@ waitagain:
goto out;
}
- if (cnt >= PAGE_SIZE)
- cnt = PAGE_SIZE - 1;
+ if (cnt >= TRACE_SEQ_BUFFER_SIZE)
+ cnt = TRACE_SEQ_BUFFER_SIZE - 1;
/* reset all but tr, trace, and overruns */
trace_iterator_reset(iter);
@@ -7283,6 +7293,8 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
return 0;
}
+#define TRACE_MARKER_MAX_SIZE 4096
+
static ssize_t
tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
@@ -7292,8 +7304,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
enum event_trigger_type tt = ETT_NONE;
struct trace_buffer *buffer;
struct print_entry *entry;
+ int meta_size;
ssize_t written;
- int size;
+ size_t size;
int len;
/* Used in tracing_mark_raw_write() as well */
@@ -7306,12 +7319,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (!(tr->trace_flags & TRACE_ITER_MARKERS))
return -EINVAL;
- if (cnt > TRACE_BUF_SIZE)
- cnt = TRACE_BUF_SIZE;
+ if ((ssize_t)cnt < 0)
+ return -EINVAL;
- BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
+ if (cnt > TRACE_MARKER_MAX_SIZE)
+ cnt = TRACE_MARKER_MAX_SIZE;
- size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
+ meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */
+ again:
+ size = cnt + meta_size;
/* If less than "<faulted>", then make sure we can still add that */
if (cnt < FAULTED_SIZE)
@@ -7320,9 +7336,25 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
tracing_gen_ctx());
- if (unlikely(!event))
+ if (unlikely(!event)) {
+ /*
+ * If the size was greater than what was allowed, then
+ * make it smaller and try again.
+ */
+ if (size > ring_buffer_max_event_size(buffer)) {
+ /* cnt < FAULTED size should never be bigger than max */
+ if (WARN_ON_ONCE(cnt < FAULTED_SIZE))
+ return -EBADF;
+ cnt = ring_buffer_max_event_size(buffer) - meta_size;
+ /* The above should only happen once */
+ if (WARN_ON_ONCE(cnt + meta_size == size))
+ return -EBADF;
+ goto again;
+ }
+
/* Ring buffer disabled, return as if not open for write */
return -EBADF;
+ }
entry = ring_buffer_event_data(event);
entry->ip = _THIS_IP_;
@@ -7357,9 +7389,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
return written;
}
-/* Limit it for now to 3K (including tag) */
-#define RAW_DATA_MAX_SIZE (1024*3)
-
static ssize_t
tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
@@ -7381,19 +7410,18 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
return -EINVAL;
/* The marker must at least have a tag id */
- if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE)
+ if (cnt < sizeof(unsigned int))
return -EINVAL;
- if (cnt > TRACE_BUF_SIZE)
- cnt = TRACE_BUF_SIZE;
-
- BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
-
size = sizeof(*entry) + cnt;
if (cnt < FAULT_SIZE_ID)
size += FAULT_SIZE_ID - cnt;
buffer = tr->array_buffer.buffer;
+
+ if (size > ring_buffer_max_event_size(buffer))
+ return -EINVAL;
+
event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
tracing_gen_ctx());
if (!event)
@@ -7578,6 +7606,7 @@ struct ftrace_buffer_info {
struct trace_iterator iter;
void *spare;
unsigned int spare_cpu;
+ unsigned int spare_size;
unsigned int read;
};
@@ -8282,6 +8311,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
{
struct ftrace_buffer_info *info = filp->private_data;
struct trace_iterator *iter = &info->iter;
+ void *trace_data;
+ int page_size;
ssize_t ret = 0;
ssize_t size;
@@ -8293,6 +8324,17 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
return -EBUSY;
#endif
+ page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+
+ /* Make sure the spare matches the current sub buffer size */
+ if (info->spare) {
+ if (page_size != info->spare_size) {
+ ring_buffer_free_read_page(iter->array_buffer->buffer,
+ info->spare_cpu, info->spare);
+ info->spare = NULL;
+ }
+ }
+
if (!info->spare) {
info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer,
iter->cpu_file);
@@ -8301,19 +8343,20 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
info->spare = NULL;
} else {
info->spare_cpu = iter->cpu_file;
+ info->spare_size = page_size;
}
}
if (!info->spare)
return ret;
/* Do we have previous read data to read? */
- if (info->read < PAGE_SIZE)
+ if (info->read < page_size)
goto read;
again:
trace_access_lock(iter->cpu_file);
ret = ring_buffer_read_page(iter->array_buffer->buffer,
- &info->spare,
+ info->spare,
count,
iter->cpu_file, 0);
trace_access_unlock(iter->cpu_file);
@@ -8334,11 +8377,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
info->read = 0;
read:
- size = PAGE_SIZE - info->read;
+ size = page_size - info->read;
if (size > count)
size = count;
-
- ret = copy_to_user(ubuf, info->spare + info->read, size);
+ trace_data = ring_buffer_read_page_data(info->spare);
+ ret = copy_to_user(ubuf, trace_data + info->read, size);
if (ret == size)
return -EFAULT;
@@ -8350,6 +8393,20 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
return size;
}
+/*
+ * ->flush handler for the per-cpu raw buffer files.
+ *
+ * Advances the iterator's wait_index and then wakes the ring buffer
+ * waiters for this file's CPU. Always returns 0.
+ */
+static int tracing_buffers_flush(struct file *file, fl_owner_t id)
+{
+	struct ftrace_buffer_info *binfo = file->private_data;
+	struct trace_iterator *it = &binfo->iter;
+
+	it->wait_index += 1;
+
+	/* Make sure the waiters see the new wait_index */
+	smp_wmb();
+	ring_buffer_wake_waiters(it->array_buffer->buffer, it->cpu_file);
+
+	return 0;
+}
+
static int tracing_buffers_release(struct inode *inode, struct file *file)
{
struct ftrace_buffer_info *info = file->private_data;
@@ -8361,12 +8418,6 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
__trace_array_put(iter->tr);
- iter->wait_index++;
- /* Make sure the waiters see the new wait_index */
- smp_wmb();
-
- ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
-
if (info->spare)
ring_buffer_free_read_page(iter->array_buffer->buffer,
info->spare_cpu, info->spare);
@@ -8449,6 +8500,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
+ int page_size;
int entries, i;
ssize_t ret = 0;
@@ -8457,13 +8509,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return -EBUSY;
#endif
- if (*ppos & (PAGE_SIZE - 1))
+ page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+ if (*ppos & (page_size - 1))
return -EINVAL;
- if (len & (PAGE_SIZE - 1)) {
- if (len < PAGE_SIZE)
+ if (len & (page_size - 1)) {
+ if (len < page_size)
return -EINVAL;
- len &= PAGE_MASK;
+ len &= (~(page_size - 1));
}
if (splice_grow_spd(pipe, &spd))
@@ -8473,7 +8526,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
trace_access_lock(iter->cpu_file);
entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
- for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
+ for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= page_size) {
struct page *page;
int r;
@@ -8494,7 +8547,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
}
ref->cpu = iter->cpu_file;
- r = ring_buffer_read_page(ref->buffer, &ref->page,
+ r = ring_buffer_read_page(ref->buffer, ref->page,
len, iter->cpu_file, 1);
if (r < 0) {
ring_buffer_free_read_page(ref->buffer, ref->cpu,
@@ -8503,14 +8556,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
}
- page = virt_to_page(ref->page);
+ page = virt_to_page(ring_buffer_read_page_data(ref->page));
spd.pages[i] = page;
- spd.partial[i].len = PAGE_SIZE;
+ spd.partial[i].len = page_size;
spd.partial[i].offset = 0;
spd.partial[i].private = (unsigned long)ref;
spd.nr_pages++;
- *ppos += PAGE_SIZE;
+ *ppos += page_size;
entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
}
@@ -8580,6 +8633,7 @@ static const struct file_operations tracing_buffers_fops = {
.read = tracing_buffers_read,
.poll = tracing_buffers_poll,
.release = tracing_buffers_release,
+ .flush = tracing_buffers_flush,
.splice_read = tracing_buffers_splice_read,
.unlocked_ioctl = tracing_buffers_ioctl,
.llseek = no_llseek,
@@ -9354,6 +9408,103 @@ static const struct file_operations buffer_percent_fops = {
.llseek = default_llseek,
};
+/*
+ * Read handler for the sub-buffer size file.
+ *
+ * Reports the size of one ring buffer sub-buffer in kilobytes, computed
+ * from the current sub-buffer order of the instance's main buffer.
+ */
+static ssize_t
+buffer_subbuf_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	size_t size;
+	char buf[64];
+	int order;
+	int r;
+
+	order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+	size = (PAGE_SIZE << order) / 1024;
+
+	/* size is a size_t: print it unsigned, and bound the write to buf */
+	r = scnprintf(buf, sizeof(buf), "%zu\n", size);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+/*
+ * Write handler for the sub-buffer size file.
+ *
+ * Parses a decimal size in kilobytes, rounds it up to a power-of-two
+ * number of system pages and applies the resulting order to the main
+ * ring buffer and, under CONFIG_TRACER_MAX_TRACE with a snapshot
+ * allocated, to the max buffer as well. Tracing on the instance is
+ * stopped while the order is being changed.
+ *
+ * Returns @cnt on success, -EINVAL for a size of zero or of more than
+ * 128 system pages, or the error reported by the parser or by
+ * ring_buffer_subbuf_order_set().
+ */
+static ssize_t
+buffer_subbuf_size_write(struct file *filp, const char __user *ubuf,
+			 size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	unsigned long val;
+	int old_order;
+	int order;
+	int pages;
+	int ret;
+
+	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+	if (ret)
+		return ret;
+
+	/*
+	 * Limit between 1 and 128 system pages. Check the range before
+	 * scaling so that neither the multiplication nor the conversion
+	 * to int below can wrap and alias a valid size.
+	 */
+	if (!val || val > (PAGE_SIZE << 7) / 1024)
+		return -EINVAL;
+
+	val *= 1024; /* value passed in is in KB */
+
+	pages = DIV_ROUND_UP(val, PAGE_SIZE);
+	order = fls(pages - 1);
+
+	/* Do not allow tracing while changing the order of the ring buffer */
+	tracing_stop_tr(tr);
+
+	old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+	if (old_order == order)
+		goto out;
+
+	ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, order);
+	if (ret)
+		goto out;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+
+	if (!tr->allocated_snapshot)
+		goto out_max;
+
+	ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
+	if (ret) {
+		int err;
+
+		/* Put back the old order */
+		err = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order);
+		if (WARN_ON_ONCE(err)) {
+			/*
+			 * AARGH! We are left with different orders!
+			 * The max buffer is our "snapshot" buffer.
+			 * When a tracer needs a snapshot (one of the
+			 * latency tracers), it swaps the max buffer
+			 * with the saved snapshot. We succeeded to
+			 * update the order of the main buffer, but failed to
+			 * update the order of the max buffer. But when we tried
+			 * to reset the main buffer to the original size, we
+			 * failed there too. This is very unlikely to
+			 * happen, but if it does, warn and kill all
+			 * tracing.
+			 */
+			tracing_disabled = 1;
+		}
+		goto out;
+	}
+ out_max:
+#endif
+	(*ppos)++;
+ out:
+	/* Restart tracing on every path that stopped it */
+	tracing_start_tr(tr);
+	return ret ? ret : cnt;
+}
+
+/*
+ * File operations for the per-instance "buffer_subbuf_size_kb" file:
+ * open/release go through the generic trace_array helpers, read/write
+ * through the buffer_subbuf_size handlers.
+ */
+static const struct file_operations buffer_subbuf_size_fops = {
+ .open = tracing_open_generic_tr,
+ .read = buffer_subbuf_size_read,
+ .write = buffer_subbuf_size_write,
+ .release = tracing_release_generic_tr,
+ .llseek = default_llseek,
+};
+
static struct dentry *trace_instance_dir;
static void
@@ -9504,7 +9655,8 @@ static int trace_array_create_dir(struct trace_array *tr)
return ret;
}
-static struct trace_array *trace_array_create(const char *name)
+static struct trace_array *
+trace_array_create_systems(const char *name, const char *systems)
{
struct trace_array *tr;
int ret;
@@ -9524,6 +9676,12 @@ static struct trace_array *trace_array_create(const char *name)
if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
goto out_free_tr;
+ if (systems) {
+ tr->system_names = kstrdup_const(systems, GFP_KERNEL);
+ if (!tr->system_names)
+ goto out_free_tr;
+ }
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9570,12 +9728,18 @@ static struct trace_array *trace_array_create(const char *name)
free_trace_buffers(tr);
free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
+ kfree_const(tr->system_names);
kfree(tr->name);
kfree(tr);
return ERR_PTR(ret);
}
+/*
+ * Create a trace array named @name without a systems list: passes NULL
+ * as the systems argument of trace_array_create_systems().
+ */
+static struct trace_array *trace_array_create(const char *name)
+{
+ return trace_array_create_systems(name, NULL);
+}
+
static int instance_mkdir(const char *name)
{
struct trace_array *tr;
@@ -9601,6 +9765,7 @@ out_unlock:
/**
* trace_array_get_by_name - Create/Lookup a trace array, given its name.
* @name: The name of the trace array to be looked up/created.
+ * @systems: A list of systems to create event directories for (NULL for all)
*
* Returns pointer to trace array with given name.
* NULL, if it cannot be created.
@@ -9614,7 +9779,7 @@ out_unlock:
* trace_array_put() is called, user space can not delete it.
*
*/
-struct trace_array *trace_array_get_by_name(const char *name)
+struct trace_array *trace_array_get_by_name(const char *name, const char *systems)
{
struct trace_array *tr;
@@ -9626,7 +9791,7 @@ struct trace_array *trace_array_get_by_name(const char *name)
goto out_unlock;
}
- tr = trace_array_create(name);
+ tr = trace_array_create_systems(name, systems);
if (IS_ERR(tr))
tr = NULL;
@@ -9673,6 +9838,7 @@ static int __remove_instance(struct trace_array *tr)
free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
+ kfree_const(tr->system_names);
kfree(tr->name);
kfree(tr);
@@ -9805,6 +9971,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
tr, &buffer_percent_fops);
+ trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
+ tr, &buffer_subbuf_size_fops);
+
create_trace_options_dir(tr);
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -10391,7 +10560,7 @@ __init static void enable_instances(void)
if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
do_allocate_snapshot(tok);
- tr = trace_array_get_by_name(tok);
+ tr = trace_array_get_by_name(tok, NULL);
if (!tr) {
pr_warn("Failed to create instance buffer %s\n", curr_str);
continue;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0489e72c8169..00f873910c5d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -377,6 +377,7 @@ struct trace_array {
unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
unsigned int flags;
raw_spinlock_t start_lock;
+ const char *system_names;
struct list_head err_log;
struct dentry *dir;
struct dentry *options;
@@ -615,6 +616,7 @@ void tracing_reset_all_online_cpus(void);
void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
+int tracing_release_generic_tr(struct inode *inode, struct file *file);
int tracing_open_file_tr(struct inode *inode, struct file *filp);
int tracing_release_file_tr(struct inode *inode, struct file *filp);
int tracing_single_release_file_tr(struct inode *inode, struct file *filp);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 7ccc7a8e155b..dbe29b4c6a7a 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -633,7 +633,7 @@ trace_boot_init_instances(struct xbc_node *node)
if (!p || *p == '\0')
continue;
- tr = trace_array_get_by_name(p);
+ tr = trace_array_get_by_name(p, NULL);
if (!tr) {
pr_err("Failed to get trace instance %s\n", p);
continue;
diff --git a/kernel/trace/trace_btf.c b/kernel/trace/trace_btf.c
index ca224d53bfdc..5bbdbcbbde3c 100644
--- a/kernel/trace/trace_btf.c
+++ b/kernel/trace/trace_btf.c
@@ -91,8 +91,8 @@ retry:
for_each_member(i, type, member) {
if (!member->name_off) {
/* Anonymous union/struct: push it for later use */
- type = btf_type_skip_modifiers(btf, member->type, &tid);
- if (type && top < BTF_ANON_STACK_MAX) {
+ if (btf_type_skip_modifiers(btf, member->type, &tid) &&
+ top < BTF_ANON_STACK_MAX) {
anon_stack[top].tid = tid;
anon_stack[top++].offset =
cur_offset + member->offset;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f29e815ca5b2..7c364b87352e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1893,9 +1893,9 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
static ssize_t
-show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
- int (*func)(struct trace_seq *s) = filp->private_data;
+ struct trace_array *tr = filp->private_data;
struct trace_seq *s;
int r;
@@ -1908,7 +1908,31 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
trace_seq_init(s);
- func(s);
+ ring_buffer_print_page_header(tr->array_buffer.buffer, s);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
+
+ kfree(s);
+
+ return r;
+}
+
+static ssize_t
+show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_seq *s;
+ int r;
+
+ if (*ppos)
+ return 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ ring_buffer_print_entry_header(s);
r = simple_read_from_buffer(ubuf, cnt, ppos,
s->buffer, trace_seq_used(s));
@@ -2165,10 +2189,18 @@ static const struct file_operations ftrace_tr_enable_fops = {
.release = subsystem_release,
};
-static const struct file_operations ftrace_show_header_fops = {
- .open = tracing_open_generic,
- .read = show_header,
+static const struct file_operations ftrace_show_header_page_fops = {
+ .open = tracing_open_generic_tr,
+ .read = show_header_page_file,
+ .llseek = default_llseek,
+ .release = tracing_release_generic_tr,
+};
+
+static const struct file_operations ftrace_show_header_event_fops = {
+ .open = tracing_open_generic_tr,
+ .read = show_header_event_file,
.llseek = default_llseek,
+ .release = tracing_release_generic_tr,
};
static int
@@ -2896,6 +2928,27 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
up_write(&trace_event_sem);
}
+/*
+ * Check whether the system of @call is named in @systems.
+ *
+ * @systems is a list of system names separated by commas and/or
+ * whitespace; a NULL list matches every event. A name only counts when
+ * it appears as a whole token, so "sched" does not match inside
+ * "schedule". Every occurrence is examined, so a name is still found
+ * when an earlier entry merely contains it (e.g. "schedule,sched").
+ *
+ * Returns true if the event's system is listed (or @systems is NULL).
+ */
+static bool event_in_systems(struct trace_event_call *call,
+			     const char *systems)
+{
+	const char *system;
+	const char *end;
+	const char *p;
+	size_t len;
+
+	if (!systems)
+		return true;
+
+	system = call->class->system;
+	len = strlen(system);
+
+	for (p = strstr(systems, system); p; p = strstr(p + 1, system)) {
+		end = p + len;
+
+		/* Token must start at the list head or right after a separator */
+		if (p == systems || isspace(*(p - 1)) || *(p - 1) == ',') {
+			/* ... and end at the list tail or right before one */
+			if (!*end || isspace(*end) || *end == ',')
+				return true;
+		}
+
+		/* Never step past the terminating NUL when resuming the search */
+		if (!*p)
+			break;
+	}
+
+	return false;
+}
+
static struct trace_event_file *
trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
@@ -2905,9 +2958,12 @@ trace_create_new_event(struct trace_event_call *call,
struct trace_event_file *file;
unsigned int first;
+ if (!event_in_systems(call, tr->system_names))
+ return NULL;
+
file = kmem_cache_alloc(file_cachep, GFP_TRACE);
if (!file)
- return NULL;
+ return ERR_PTR(-ENOMEM);
pid_list = rcu_dereference_protected(tr->filtered_pids,
lockdep_is_held(&event_mutex));
@@ -2972,8 +3028,17 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
struct trace_event_file *file;
file = trace_create_new_event(call, tr);
+ /*
+ * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
+ * allocation, or NULL if the event is not part of the tr->system_names.
+ * When the event is not part of the tr->system_names, return zero, not
+ * an error.
+ */
if (!file)
- return -ENOMEM;
+ return 0;
+
+ if (IS_ERR(file))
+ return PTR_ERR(file);
if (eventdir_initialized)
return event_create_dir(tr->event_dir, file);
@@ -3012,8 +3077,17 @@ __trace_early_add_new_event(struct trace_event_call *call,
int ret;
file = trace_create_new_event(call, tr);
+ /*
+ * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
+ * allocation, or NULL if the event is not part of the tr->system_names.
+ * When the event is not part of the tr->system_names, return zero, not
+ * an error.
+ */
if (!file)
- return -ENOMEM;
+ return 0;
+
+ if (IS_ERR(file))
+ return PTR_ERR(file);
ret = event_define_fields(call);
if (ret)
@@ -3752,17 +3826,16 @@ static int events_callback(const char *name, umode_t *mode, void **data,
return 1;
}
- if (strcmp(name, "header_page") == 0)
- *data = ring_buffer_print_page_header;
-
- else if (strcmp(name, "header_event") == 0)
- *data = ring_buffer_print_entry_header;
+ if (strcmp(name, "header_page") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &ftrace_show_header_page_fops;
- else
+ } else if (strcmp(name, "header_event") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &ftrace_show_header_event_fops;
+ } else
return 0;
- *mode = TRACE_MODE_READ;
- *fops = &ftrace_show_header_fops;
return 1;
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 5ecf3c8bde20..6ece1308d36a 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -4805,36 +4805,35 @@ static int parse_actions(struct hist_trigger_data *hist_data)
int len;
for (i = 0; i < hist_data->attrs->n_actions; i++) {
+ enum handler_id hid = 0;
+ char *action_str;
+
str = hist_data->attrs->action_str[i];
- if ((len = str_has_prefix(str, "onmatch("))) {
- char *action_str = str + len;
+ if ((len = str_has_prefix(str, "onmatch(")))
+ hid = HANDLER_ONMATCH;
+ else if ((len = str_has_prefix(str, "onmax(")))
+ hid = HANDLER_ONMAX;
+ else if ((len = str_has_prefix(str, "onchange(")))
+ hid = HANDLER_ONCHANGE;
- data = onmatch_parse(tr, action_str);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
- } else if ((len = str_has_prefix(str, "onmax("))) {
- char *action_str = str + len;
+ action_str = str + len;
- data = track_data_parse(hist_data, action_str,
- HANDLER_ONMAX);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
- } else if ((len = str_has_prefix(str, "onchange("))) {
- char *action_str = str + len;
+ switch (hid) {
+ case HANDLER_ONMATCH:
+ data = onmatch_parse(tr, action_str);
+ break;
+ case HANDLER_ONMAX:
+ case HANDLER_ONCHANGE:
+ data = track_data_parse(hist_data, action_str, hid);
+ break;
+ default:
+ data = ERR_PTR(-EINVAL);
+ break;
+ }
- data = track_data_parse(hist_data, action_str,
- HANDLER_ONCHANGE);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
- } else {
- ret = -EINVAL;
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
break;
}
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index e7af286af4f1..c82b401a294d 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -441,8 +441,9 @@ static unsigned int trace_string(struct synth_trace_event *entry,
if (is_dynamic) {
union trace_synth_field *data = &entry->fields[*n_u64];
+ len = fetch_store_strlen((unsigned long)str_val);
data->as_dynamic.offset = struct_size(entry, fields, event->n_u64) + data_size;
- data->as_dynamic.len = fetch_store_strlen((unsigned long)str_val);
+ data->as_dynamic.len = len;
ret = fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 46439e3bcec4..b33c3861fbbb 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1470,8 +1470,10 @@ register_snapshot_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
{
- if (tracing_alloc_snapshot_instance(file->tr) != 0)
- return 0;
+ int ret = tracing_alloc_snapshot_instance(file->tr);
+
+ if (ret < 0)
+ return ret;
return register_trigger(glob, data, file);
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 52f8b537dd0a..c4c6e0e0068b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -487,8 +487,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
return -EINVAL;
if (within_notrace_func(tk)) {
- pr_warn("Could not probe notrace function %s\n",
- trace_kprobe_symbol(tk));
+ pr_warn("Could not probe notrace function %ps\n",
+ (void *)trace_kprobe_address(tk));
return -EINVAL;
}
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index bd0d01d00fb9..a8e28f9b9271 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -2444,6 +2444,9 @@ static int timerlat_fd_open(struct inode *inode, struct file *file)
tlat = this_cpu_tmr_var();
tlat->count = 0;
+ hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
+ tlat->timer.function = timerlat_irq;
+
migrate_enable();
return 0;
};
@@ -2526,9 +2529,6 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count,
tlat->tracing_thread = false;
tlat->kthread = current;
- hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
- tlat->timer.function = timerlat_irq;
-
/* Annotate now to drift new period */
tlat->abs_period = hrtimer_cb_get_time(&tlat->timer);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 3e7fa44dc2b2..d8b302d01083 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1587,12 +1587,11 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
{
struct print_entry *field;
struct trace_seq *s = &iter->seq;
- int max = iter->ent_size - offsetof(struct print_entry, buf);
trace_assign_type(field, iter->ent);
seq_print_ip_sym(s, field->ip, flags);
- trace_seq_printf(s, ": %.*s", max, field->buf);
+ trace_seq_printf(s, ": %s", field->buf);
return trace_handle_return(s);
}
@@ -1601,11 +1600,10 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
struct print_entry *field;
- int max = iter->ent_size - offsetof(struct print_entry, buf);
trace_assign_type(field, iter->ent);
- trace_seq_printf(&iter->seq, "# %lx %.*s", field->ip, max, field->buf);
+ trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
return trace_handle_return(&iter->seq);
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 4dc74d73fc1d..34289f9c6707 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1159,9 +1159,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
if (!(ctx->flags & TPARG_FL_TEVENT) &&
(strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 ||
strncmp(arg, "\\\"", 2) == 0)) {
- /* The type of $comm must be "string", and not an array. */
- if (parg->count || (t && strcmp(t, "string")))
+ /* The type of $comm must be "string", and not an array type. */
+ if (parg->count || (t && strcmp(t, "string"))) {
+ trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
+ NEED_STRING_TYPE);
goto out;
+ }
parg->type = find_fetch_type("string", ctx->flags);
} else
parg->type = find_fetch_type(t, ctx->flags);
@@ -1169,18 +1172,6 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE);
goto out;
}
- parg->offset = *size;
- *size += parg->type->size * (parg->count ?: 1);
-
- ret = -ENOMEM;
- if (parg->count) {
- len = strlen(parg->type->fmttype) + 6;
- parg->fmt = kmalloc(len, GFP_KERNEL);
- if (!parg->fmt)
- goto out;
- snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
- parg->count);
- }
code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
if (!code)
@@ -1204,6 +1195,19 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
goto fail;
}
}
+ parg->offset = *size;
+ *size += parg->type->size * (parg->count ?: 1);
+
+ if (parg->count) {
+ len = strlen(parg->type->fmttype) + 6;
+ parg->fmt = kmalloc(len, GFP_KERNEL);
+ if (!parg->fmt) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
+ parg->count);
+ }
ret = -EINVAL;
/* Store operation */
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 850d9ecb6765..c1877d018269 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -515,7 +515,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(BAD_HYPHEN, "Failed to parse single hyphen. Forgot '>'?"), \
C(NO_BTF_FIELD, "This field is not found."), \
C(BAD_BTF_TID, "Failed to get BTF type info."),\
- C(BAD_TYPE4STR, "This type does not fit for string."),
+ C(BAD_TYPE4STR, "This type does not fit for string."),\
+ C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),
#undef C
#define C(a, b) TP_ERR_##a
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 7be97229ddf8..c158d65a8a88 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -13,9 +13,6 @@
* trace_seq_init() more than once to reset the trace_seq to start
* from scratch.
*
- * The buffer size is currently PAGE_SIZE, although it may become dynamic
- * in the future.
- *
* A write to the buffer will either succeed or fail. That is, unlike
* sprintf() there will not be a partial write (well it may write into
* the buffer but it wont update the pointers). This allows users to
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index c774e560f2f9..a4dcf0f24352 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -574,7 +574,12 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
}
memcpy(elt->key, key, map->key_size);
- entry->val = elt;
+ /*
+ * Ensure the initialization is visible and
+ * publish the elt.
+ */
+ smp_wmb();
+ WRITE_ONCE(entry->val, elt);
atomic64_inc(&map->hits);
return entry->val;