From e32540b1e4b37fd720b59f8a504d7592fc3483bf Mon Sep 17 00:00:00 2001 From: Li Chen Date: Mon, 19 Aug 2024 14:01:53 +0800 Subject: ftrace: Use this_cpu_ptr() instead of per_cpu_ptr(smp_processor_id()) Use this_cpu_ptr() instead of open coding the equivalent in various ftrace functions. Cc: Mathieu Desnoyers Cc: Masami Hiramatsu Link: https://lore.kernel.org/87y14t6ofi.wl-me@linux.beauty Signed-off-by: Li Chen Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 3b0cea37e029..65fed0bbc5c2 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -184,7 +184,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, struct trace_array_cpu *data; unsigned int trace_ctx; int bit; - int cpu; if (unlikely(!tr->function_enabled)) return; @@ -195,8 +194,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx(); - cpu = smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); + data = this_cpu_ptr(tr->array_buffer.data); if (!atomic_read(&data->disabled)) trace_function(tr, ip, parent_ip, trace_ctx); @@ -300,7 +298,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, unsigned int trace_ctx; unsigned long flags; int bit; - int cpu; if (unlikely(!tr->function_enabled)) return; @@ -309,8 +306,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - cpu = smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); + data = this_cpu_ptr(tr->array_buffer.data); if (atomic_read(&data->disabled)) goto out; @@ -321,7 +317,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, * TODO: think about a solution that is better than just hoping to be * lucky. */ - last_info = per_cpu_ptr(tr->last_func_repeats, cpu); + last_info = this_cpu_ptr(tr->last_func_repeats); if (is_repeat_check(tr, last_info, ip, parent_ip)) goto out; -- cgit v1.2.3-70-g09d2 From 2aa746ec0240dcbe70aef10f40fb1518f6dfb137 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Mon, 26 Aug 2024 10:40:49 -0700 Subject: tracing/branch-profiler: Replace deprecated strncpy with strscpy strncpy() is deprecated for use on NUL-terminated destination strings [1] and as such we should prefer more robust and less ambiguous string interfaces. Both of these fields want to be NUL-terminated as per their use in printk: F_printk("%u:%s:%s (%u)%s", __entry->line, __entry->func, __entry->file, __entry->correct, __entry->constant ? " CONSTANT" : "") Use strscpy() as it NUL-terminates the destination buffer, so it doesn't have to be done manually. 
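For illustration only (not part of the patch): the termination difference can be sketched in plain C. strscpy_sketch() below is a simplified userspace stand-in for the kernel's strscpy() (the real one returns -E2BIG on truncation); the buffer size and strings here are invented.

#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/* Simplified userspace model of the kernel's strscpy(): copy at most
 * size - 1 bytes and always NUL-terminate dst. Returns -1 where the
 * kernel version would return -E2BIG. */
static ssize_t strscpy_sketch(char *dst, const char *src, size_t size)
{
	size_t len;

	if (!size)
		return -1;
	len = strnlen(src, size - 1);
	memcpy(dst, src, len);
	dst[len] = '\0';
	return strnlen(src, size) > len ? -1 : (ssize_t)len;
}

int main(void)
{
	char func[8];

	/* strncpy() leaves dst unterminated when src fills the buffer,
	 * hence the old manual entry->func[TRACE_FUNC_SIZE] = 0 step. */
	strncpy(func, "function_trace_call", sizeof(func) - 1);
	func[sizeof(func) - 1] = '\0';

	/* The strscpy()-style copy terminates unconditionally. */
	strscpy_sketch(func, "function_trace_call", sizeof(func));
	printf("%s\n", func); /* prints the truncated "functio" */
	return 0;
}

Unlike strlcpy(), strscpy() also signals truncation through its return value instead of returning the untruncated source length, which is one reason the kernel documentation recommends it.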
Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html Link: https://github.com/KSPP/linux/issues/90 Cc: linux-hardening@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Kees Cook Link: https://lore.kernel.org/20240826-strncpy-kernel-trace-trace_branch-c-v1-1-b2c14f2e9e84@google.com Signed-off-by: Justin Stitt Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_branch.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index e47fdb4c92fb..aa63548873c3 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -74,10 +74,8 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) p--; p++; - strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE); - strncpy(entry->file, p, TRACE_FILE_SIZE); - entry->func[TRACE_FUNC_SIZE] = 0; - entry->file[TRACE_FILE_SIZE] = 0; + strscpy(entry->func, f->data.func); + strscpy(entry->file, p); entry->constant = f->constant; entry->line = f->data.line; entry->correct = val == expect; -- cgit v1.2.3-70-g09d2 From 49e4154f4b16345da5e219b23ed9737a6e735bc1 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 11 Sep 2024 09:00:26 +0800 Subject: tracing: Remove TRACE_EVENT_FL_FILTERED logic After commit dcb0b5575d24 ("tracing: Remove TRACE_EVENT_FL_USE_CALL_FILTER logic"), no one's going to set the TRACE_EVENT_FL_FILTERED or change the call->filter, so remove related logic. Link: https://lore.kernel.org/20240911010026.2302849-1-zhengyejian@huaweicloud.com Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 4 ---- kernel/trace/trace.c | 44 ++++++++---------------------------- kernel/trace/trace.h | 4 ---- kernel/trace/trace_branch.c | 4 +--- kernel/trace/trace_events.c | 2 -- kernel/trace/trace_functions_graph.c | 8 ++----- kernel/trace/trace_hwlat.c | 4 +--- kernel/trace/trace_mmiotrace.c | 8 ++----- kernel/trace/trace_osnoise.c | 12 +++------- kernel/trace/trace_sched_wakeup.c | 8 ++----- 10 files changed, 20 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 42bedcddd511..f8f2e52653df 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -326,7 +326,6 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, void trace_event_buffer_commit(struct trace_event_buffer *fbuffer); enum { - TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, @@ -341,7 +340,6 @@ enum { /* * Event flags: - * FILTERED - The event has a filter attached * CAP_ANY - Any user can enable for perf * NO_SET_FILTER - Set when filter has error and is to be ignored * IGNORE_ENABLE - For trace internal events, do not enable with debugfs file @@ -356,7 +354,6 @@ enum { * to a tracepoint yet, then it is cleared when it is. 
*/ enum { - TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), @@ -381,7 +378,6 @@ struct trace_event_call { }; struct trace_event event; char *print_fmt; - struct event_filter *filter; /* * Static events can disappear with modules, * where as dynamic ones need their own ref count. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1c69ca1f1088..bdb776e6ceb9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -593,19 +593,6 @@ int tracing_check_open_get_tr(struct trace_array *tr) return 0; } -int call_filter_check_discard(struct trace_event_call *call, void *rec, - struct trace_buffer *buffer, - struct ring_buffer_event *event) -{ - if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && - !filter_match_preds(call->filter, rec)) { - __trace_event_discard_commit(buffer, event); - return 1; - } - - return 0; -} - /** * trace_find_filtered_pid - check if a pid exists in a filtered_pid list * @filtered_pids: The list of pids to check @@ -2889,7 +2876,6 @@ void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned int trace_ctx) { - struct trace_event_call *call = &event_function; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -2902,11 +2888,9 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long entry->ip = ip; entry->parent_ip = parent_ip; - if (!call_filter_check_discard(call, entry, buffer, event)) { - if (static_branch_unlikely(&trace_function_exports_enabled)) - ftrace_exports(event, TRACE_EXPORT_FUNCTION); - __buffer_unlock_commit(buffer, event); - } + if (static_branch_unlikely(&trace_function_exports_enabled)) + ftrace_exports(event, TRACE_EXPORT_FUNCTION); + __buffer_unlock_commit(buffer, event); } #ifdef CONFIG_STACKTRACE @@ -2932,7 +2916,6 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { - struct trace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; unsigned int size, nr_entries; struct ftrace_stack *fstack; @@ -2986,8 +2969,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, memcpy(&entry->caller, fstack->calls, flex_array_size(entry, caller, nr_entries)); - if (!call_filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); out: /* Again, don't let gcc optimize things here */ @@ -3060,7 +3042,6 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx) { - struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; @@ -3094,8 +3075,7 @@ ftrace_trace_userstack(struct trace_array *tr, memset(&entry->caller, 0, sizeof(entry->caller)); stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES); - if (!call_filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); out_drop_count: __this_cpu_dec(user_stack_count); @@ -3264,7 +3244,6 @@ static void trace_printk_start_stop_comm(int enabled) */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { - struct trace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct trace_buffer 
*buffer; struct trace_array *tr = READ_ONCE(printk_trace); @@ -3308,10 +3287,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, tbuffer, sizeof(u32) * len); - if (!call_filter_check_discard(call, entry, buffer, event)) { - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); - } + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); out: ring_buffer_nest_end(buffer); @@ -3331,7 +3308,6 @@ static int __trace_array_vprintk(struct trace_buffer *buffer, unsigned long ip, const char *fmt, va_list args) { - struct trace_event_call *call = &event_print; struct ring_buffer_event *event; int len = 0, size; struct print_entry *entry; @@ -3366,10 +3342,8 @@ __trace_array_vprintk(struct trace_buffer *buffer, entry->ip = ip; memcpy(&entry->buf, tbuffer, len + 1); - if (!call_filter_check_discard(call, entry, buffer, event)) { - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); - } + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); out: ring_buffer_nest_end(buffer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c866991b9c78..638f452eec10 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1429,10 +1429,6 @@ struct trace_subsystem_dir { int nr_events; }; -extern int call_filter_check_discard(struct trace_event_call *call, void *rec, - struct trace_buffer *buffer, - struct ring_buffer_event *event); - void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index aa63548873c3..6d08a5523ce0 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -30,7 +30,6 @@ static struct trace_array *branch_tracer; static void probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) { - struct trace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; struct trace_buffer *buffer; struct trace_array_cpu *data; @@ -80,8 +79,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) entry->line = f->data.line; entry->correct = val == expect; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); out: current->trace_recursion &= ~TRACE_BRANCH_BIT; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7266ec2a4eea..77e68efbd43e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3149,8 +3149,6 @@ static void __trace_remove_event_call(struct trace_event_call *call) { event_remove(call); trace_destroy_fields(call); - free_event_filter(call->filter); - call->filter = NULL; } static int probe_remove_event_call(struct trace_event_call *call) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a569daaac4c4..ab57ec78ca04 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -102,7 +102,6 @@ int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx) { - struct trace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ent_entry *entry; @@ -113,8 +112,7 @@ int 
__trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); return 1; } @@ -223,7 +221,6 @@ void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned int trace_ctx) { - struct trace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ret_entry *entry; @@ -234,8 +231,7 @@ void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } void trace_graph_return(struct ftrace_graph_ret *trace, diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 3bd6071441ad..b65353ec2837 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -130,7 +130,6 @@ static bool hwlat_busy; static void trace_hwlat_sample(struct hwlat_sample *sample) { struct trace_array *tr = hwlat_trace; - struct trace_event_call *call = &event_hwlat; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct hwlat_entry *entry; @@ -148,8 +147,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) entry->nmi_count = sample->nmi_count; entry->count = sample->count; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* Macros to encapsulate the time capturing infrastructure */ diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 64e77b513697..ba5858866b2f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -294,7 +294,6 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_rw *rw) { - struct trace_event_call *call = &event_mmiotrace_rw; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; @@ -310,8 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->rw = *rw; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -325,7 +323,6 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_map *map) { - struct trace_event_call *call = &event_mmiotrace_map; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; @@ -341,8 +338,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->map = *map; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index a50ed23bee77..b9f96c77527d 100644 --- a/kernel/trace/trace_osnoise.c +++ 
b/kernel/trace/trace_osnoise.c @@ -499,7 +499,6 @@ static void print_osnoise_headers(struct seq_file *s) static void __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) { - struct trace_event_call *call = &event_osnoise; struct ring_buffer_event *event; struct osnoise_entry *entry; @@ -517,8 +516,7 @@ __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffe entry->softirq_count = sample->softirq_count; entry->thread_count = sample->thread_count; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* @@ -578,7 +576,6 @@ static void print_timerlat_headers(struct seq_file *s) static void __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) { - struct trace_event_call *call = &event_osnoise; struct ring_buffer_event *event; struct timerlat_entry *entry; @@ -591,8 +588,7 @@ __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buf entry->context = sample->context; entry->timer_latency = sample->timer_latency; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* @@ -654,7 +650,6 @@ static void timerlat_save_stack(int skip) static void __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, unsigned int size) { - struct trace_event_call *call = &event_osnoise; struct ring_buffer_event *event; struct stack_entry *entry; @@ -668,8 +663,7 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u memcpy(&entry->caller, fstack->calls, size); entry->size = fstack->nr_entries; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ae2ace5e515a..d6c7f18daa15 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -378,7 +378,6 @@ tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned int trace_ctx) { - struct trace_event_call *call = &event_context_switch; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -396,8 +395,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = task_state_index(next); entry->next_cpu = task_cpu(next); - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } static void @@ -406,7 +404,6 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *curr, unsigned int trace_ctx) { - struct trace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -424,8 +421,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = task_state_index(wakee); entry->next_cpu = task_cpu(wakee); - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } static void notrace -- cgit v1.2.3-70-g09d2 From 4a8840af5f53f2902eba91130fae650879f18e7a Mon Sep 17 00:00:00 2001 From: 
Josh Poimboeuf Date: Tue, 8 Oct 2024 12:17:19 -0700 Subject: tracepoints: Use new static branch API The old static key API is deprecated. Switch to the new one. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Alice Ryhl Link: https://lore.kernel.org/7a08dae3c5eddb14b13864923c1b58ac1f4af83c.1728414936.git.jpoimboe@kernel.org Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint-defs.h | 4 ++-- include/linux/tracepoint.h | 8 ++++---- kernel/trace/trace_events_hist.c | 2 +- kernel/trace/trace_events_user.c | 4 ++-- kernel/tracepoint.c | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 4dc4955f0fbf..60a6e8314d4c 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -31,7 +31,7 @@ struct tracepoint_func { struct tracepoint { const char *name; /* Tracepoint name */ - struct static_key key; + struct static_key_false key; struct static_call_key *static_call_key; void *static_call_tramp; void *iterator; @@ -83,7 +83,7 @@ struct bpf_raw_event_map { #ifdef CONFIG_TRACEPOINTS # define tracepoint_enabled(tp) \ - static_key_false(&(__tracepoint_##tp).key) + static_branch_unlikely(&(__tracepoint_##tp).key) #else # define tracepoint_enabled(tracepoint) false #endif diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 93a9f3070b48..2a29334bbc02 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -248,7 +248,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE_RCU(name, proto, args, cond) \ static inline void trace_##name##_rcuidle(proto) \ { \ - if (static_key_false(&__tracepoint_##name.key)) \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ __DO_TRACE(name, \ TP_ARGS(args), \ TP_CONDITION(cond), 1); \ @@ -274,7 +274,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - if (static_key_false(&__tracepoint_##name.key)) \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ __DO_TRACE(name, \ TP_ARGS(args), \ TP_CONDITION(cond), 0); \ @@ -311,7 +311,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) static inline bool \ trace_##name##_enabled(void) \ { \ - return static_key_false(&__tracepoint_##name.key); \ + return static_branch_unlikely(&__tracepoint_##name.key);\ } /* @@ -328,7 +328,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) struct tracepoint __tracepoint_##_name __used \ __section("__tracepoints") = { \ .name = __tpstrtab_##_name, \ - .key = STATIC_KEY_INIT_FALSE, \ + .key = STATIC_KEY_FALSE_INIT, \ .static_call_key = &STATIC_CALL_KEY(tp_func_##_name), \ .static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \ .iterator = &__traceiter_##_name, \ diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5f9119eb7c67..cc2924ad32a3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -822,7 +822,7 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, { struct tracepoint *tp = event->tp; - if (unlikely(atomic_read(&tp->key.enabled) > 0)) { + if (unlikely(static_key_enabled(&tp->key))) { struct tracepoint_func *probe_func_ptr; synth_probe_func_t probe_func; void *__data; diff --git a/kernel/trace/trace_events_user.c 
b/kernel/trace/trace_events_user.c index 42b0d998d103..17bcad8f79de 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1676,7 +1676,7 @@ static void update_enable_bit_for(struct user_event *user) struct tracepoint *tp = &user->tracepoint; char status = 0; - if (atomic_read(&tp->key.enabled) > 0) { + if (static_key_enabled(&tp->key)) { struct tracepoint_func *probe_func_ptr; user_event_func_t probe_func; @@ -2280,7 +2280,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) * It's possible key.enabled disables after this check, however * we don't mind if a few events are included in this condition. */ - if (likely(atomic_read(&tp->key.enabled) > 0)) { + if (likely(static_key_enabled(&tp->key))) { struct tracepoint_func *probe_func_ptr; user_event_func_t probe_func; struct iov_iter copy; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 8879da16ef4d..1e3de77ea6b3 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -358,7 +358,7 @@ static int tracepoint_add_func(struct tracepoint *tp, tracepoint_update_call(tp, tp_funcs); /* Both iterator and static call handle NULL tp->funcs */ rcu_assign_pointer(tp->funcs, tp_funcs); - static_key_enable(&tp->key); + static_branch_enable(&tp->key); break; case TP_FUNC_2: /* 1->2 */ /* Set iterator static call */ @@ -414,7 +414,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, if (tp->unregfunc && static_key_enabled(&tp->key)) tp->unregfunc(); - static_key_disable(&tp->key); + static_branch_disable(&tp->key); /* Set iterator static call */ tracepoint_update_call(tp, tp_funcs); /* Both iterator and static call handle NULL tp->funcs */ -- cgit v1.2.3-70-g09d2 From 48bcda6848232667f13b4e97588de488c83c37d4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Oct 2024 18:16:29 -0400 Subject: tracing: Remove definition of trace_*_rcuidle() The trace_*_rcuidle() variant of a tracepoint was to handle places where a tracepoint was located but RCU was not "watching". All those locations have been removed, and RCU should be watching where all tracepoints are located. We can now remove the trace_*_rcuidle() variant. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Cc: Joel Fernandes Link: https://lore.kernel.org/20241003181629.36209057@gandalf.local.home Acked-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 50 ++------------------------------------- include/trace/events/preemptirq.h | 8 ------- kernel/trace/trace_preemptirq.c | 26 +++++--------------- scripts/tags.sh | 2 -- 4 files changed, 8 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 2a29334bbc02..7e4af7b3633c 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -196,67 +196,25 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DO_TRACE_CALL(name, args) __traceiter_##name(NULL, args) #endif /* CONFIG_HAVE_STATIC_CALL */ -/* - * ARCH_WANTS_NO_INSTR archs are expected to have sanitized entry and idle - * code that disallow any/all tracing/instrumentation when RCU isn't watching. - */ -#ifdef CONFIG_ARCH_WANTS_NO_INSTR -#define RCUIDLE_COND(rcuidle) (rcuidle) -#else -/* srcu can't be used from NMI */ -#define RCUIDLE_COND(rcuidle) (rcuidle && in_nmi()) -#endif - /* * it_func[0] is never NULL because there is at least one element in the array * when the array itself is non NULL. 
*/ -#define __DO_TRACE(name, args, cond, rcuidle) \ +#define __DO_TRACE(name, args, cond) \ do { \ int __maybe_unused __idx = 0; \ \ if (!(cond)) \ return; \ \ - if (WARN_ONCE(RCUIDLE_COND(rcuidle), \ - "Bad RCU usage for tracepoint")) \ - return; \ - \ /* keep srcu and sched-rcu usage consistent */ \ preempt_disable_notrace(); \ \ - /* \ - * For rcuidle callers, use srcu since sched-rcu \ - * doesn't work from the idle path. \ - */ \ - if (rcuidle) { \ - __idx = srcu_read_lock_notrace(&tracepoint_srcu);\ - ct_irq_enter_irqson(); \ - } \ - \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ \ - if (rcuidle) { \ - ct_irq_exit_irqson(); \ - srcu_read_unlock_notrace(&tracepoint_srcu, __idx);\ - } \ - \ preempt_enable_notrace(); \ } while (0) -#ifndef MODULE -#define __DECLARE_TRACE_RCU(name, proto, args, cond) \ - static inline void trace_##name##_rcuidle(proto) \ - { \ - if (static_branch_unlikely(&__tracepoint_##name.key)) \ - __DO_TRACE(name, \ - TP_ARGS(args), \ - TP_CONDITION(cond), 1); \ - } -#else -#define __DECLARE_TRACE_RCU(name, proto, args, cond) -#endif - /* * Make sure the alignment of the structure in the __tracepoints section will * not add unwanted padding between the beginning of the section and the @@ -277,14 +235,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) if (static_branch_unlikely(&__tracepoint_##name.key)) \ __DO_TRACE(name, \ TP_ARGS(args), \ - TP_CONDITION(cond), 0); \ + TP_CONDITION(cond)); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ } \ } \ - __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \ - PARAMS(cond)) \ static inline int \ register_trace_##name(void (*probe)(data_proto), void *data) \ { \ @@ -375,8 +331,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ static inline void trace_##name(proto) \ { } \ - static inline void trace_##name##_rcuidle(proto) \ - { } \ static inline int \ register_trace_##name(void (*probe)(data_proto), \ void *data) \ diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h index 3f249e150c0c..f99562d2b496 100644 --- a/include/trace/events/preemptirq.h +++ b/include/trace/events/preemptirq.h @@ -43,8 +43,6 @@ DEFINE_EVENT(preemptirq_template, irq_enable, #else #define trace_irq_enable(...) #define trace_irq_disable(...) -#define trace_irq_enable_rcuidle(...) -#define trace_irq_disable_rcuidle(...) #endif #ifdef CONFIG_TRACE_PREEMPT_TOGGLE @@ -58,8 +56,6 @@ DEFINE_EVENT(preemptirq_template, preempt_enable, #else #define trace_preempt_enable(...) #define trace_preempt_disable(...) -#define trace_preempt_enable_rcuidle(...) -#define trace_preempt_disable_rcuidle(...) #endif #endif /* _TRACE_PREEMPTIRQ_H */ @@ -69,10 +65,6 @@ DEFINE_EVENT(preemptirq_template, preempt_enable, #else /* !CONFIG_PREEMPTIRQ_TRACEPOINTS */ #define trace_irq_enable(...) #define trace_irq_disable(...) -#define trace_irq_enable_rcuidle(...) -#define trace_irq_disable_rcuidle(...) #define trace_preempt_enable(...) #define trace_preempt_disable(...) -#define trace_preempt_enable_rcuidle(...) -#define trace_preempt_disable_rcuidle(...) 
#endif diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index e37446f7916e..5c03633316a6 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -15,20 +15,6 @@ #define CREATE_TRACE_POINTS #include -/* - * Use regular trace points on architectures that implement noinstr - * tooling: these calls will only happen with RCU enabled, which can - * use a regular tracepoint. - * - * On older architectures, use the rcuidle tracing methods (which - * aren't NMI-safe - so exclude NMI contexts): - */ -#ifdef CONFIG_ARCH_WANTS_NO_INSTR -#define trace(point) trace_##point -#else -#define trace(point) if (!in_nmi()) trace_##point##_rcuidle -#endif - #ifdef CONFIG_TRACE_IRQFLAGS /* Per-cpu variable to prevent redundant calls when IRQs already off */ static DEFINE_PER_CPU(int, tracing_irq_cpu); @@ -42,7 +28,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu); void trace_hardirqs_on_prepare(void) { if (this_cpu_read(tracing_irq_cpu)) { - trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -53,7 +39,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { - trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -75,7 +61,7 @@ void trace_hardirqs_off_finish(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); } } @@ -89,7 +75,7 @@ void trace_hardirqs_off(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); } } EXPORT_SYMBOL(trace_hardirqs_off); @@ -100,13 +86,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off); void trace_preempt_on(unsigned long a0, unsigned long a1) { - trace(preempt_enable)(a0, a1); + trace_preempt_enable(a0, a1); tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - trace(preempt_disable)(a0, a1); + trace_preempt_disable(a0, a1); tracer_preempt_off(a0, a1); } #endif diff --git a/scripts/tags.sh b/scripts/tags.sh index 191e0461d6d5..0d01c1cafb70 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -152,9 +152,7 @@ regex_c=( '/^BPF_CALL_[0-9]([[:space:]]*\([[:alnum:]_]*\).*/\1/' '/^COMPAT_SYSCALL_DEFINE[0-9]([[:space:]]*\([[:alnum:]_]*\).*/compat_sys_\1/' '/^TRACE_EVENT([[:space:]]*\([[:alnum:]_]*\).*/trace_\1/' - '/^TRACE_EVENT([[:space:]]*\([[:alnum:]_]*\).*/trace_\1_rcuidle/' '/^DEFINE_EVENT([^,)]*,[[:space:]]*\([[:alnum:]_]*\).*/trace_\1/' - '/^DEFINE_EVENT([^,)]*,[[:space:]]*\([[:alnum:]_]*\).*/trace_\1_rcuidle/' '/^DEFINE_INSN_CACHE_OPS([[:space:]]*\([[:alnum:]_]*\).*/get_\1_slot/' '/^DEFINE_INSN_CACHE_OPS([[:space:]]*\([[:alnum:]_]*\).*/free_\1_slot/' '/^PAGEFLAG([[:space:]]*\([[:alnum:]_]*\).*/Page\1/' -- cgit v1.2.3-70-g09d2 From e53244e2c8931f9e80c1841293aea86ef8ad32a3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Oct 2024 18:42:20 -0400 Subject: tracepoint: Remove SRCU protection With the removal of the trace_*_rcuidle() tracepoints, there is no reason to protect tracepoints with SRCU. 
The reason the SRCU protection was added, was because it can protect tracepoints when RCU is not "watching". Now that tracepoints are only used when RCU is watching, remove the SRCU protection. It just made things more complex and confusing anyway. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Joel Fernandes Link: https://lore.kernel.org/20241003184220.0dc21d35@gandalf.local.home Acked-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 4 ---- kernel/tracepoint.c | 51 +--------------------------------------------- 2 files changed, 1 insertion(+), 54 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 7e4af7b3633c..3d33b9872cec 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -32,8 +32,6 @@ struct trace_eval_map { #define TRACEPOINT_DEFAULT_PRIO 10 -extern struct srcu_struct tracepoint_srcu; - extern int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data); extern int @@ -109,7 +107,6 @@ void for_each_tracepoint_in_module(struct module *mod, #ifdef CONFIG_TRACEPOINTS static inline void tracepoint_synchronize_unregister(void) { - synchronize_srcu(&tracepoint_srcu); synchronize_rcu(); } #else @@ -207,7 +204,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) if (!(cond)) \ return; \ \ - /* keep srcu and sched-rcu usage consistent */ \ preempt_disable_notrace(); \ \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 1e3de77ea6b3..6474e2cf22c9 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -25,9 +25,6 @@ enum tp_func_state { extern tracepoint_ptr_t __start___tracepoints_ptrs[]; extern tracepoint_ptr_t __stop___tracepoints_ptrs[]; -DEFINE_SRCU(tracepoint_srcu); -EXPORT_SYMBOL_GPL(tracepoint_srcu); - enum tp_transition_sync { TP_TRANSITION_SYNC_1_0_1, TP_TRANSITION_SYNC_N_2_1, @@ -37,7 +34,6 @@ enum tp_transition_sync { struct tp_transition_snapshot { unsigned long rcu; - unsigned long srcu; bool ongoing; }; @@ -50,7 +46,6 @@ static void tp_rcu_get_state(enum tp_transition_sync sync) /* Keep the latest get_state snapshot. */ snapshot->rcu = get_state_synchronize_rcu(); - snapshot->srcu = start_poll_synchronize_srcu(&tracepoint_srcu); snapshot->ongoing = true; } @@ -61,8 +56,6 @@ static void tp_rcu_cond_sync(enum tp_transition_sync sync) if (!snapshot->ongoing) return; cond_synchronize_rcu(snapshot->rcu); - if (!poll_state_synchronize_srcu(&tracepoint_srcu, snapshot->srcu)) - synchronize_srcu(&tracepoint_srcu); snapshot->ongoing = false; } @@ -85,9 +78,6 @@ static LIST_HEAD(tracepoint_module_list); */ static DEFINE_MUTEX(tracepoints_mutex); -static struct rcu_head *early_probes; -static bool ok_to_free_tracepoints; - /* * Note about RCU : * It is used to delay the free of multiple probes array until a quiescent @@ -111,56 +101,17 @@ static inline void *allocate_probes(int count) return p == NULL ? 
NULL : p->probes; } -static void srcu_free_old_probes(struct rcu_head *head) -{ - kfree(container_of(head, struct tp_probes, rcu)); -} - static void rcu_free_old_probes(struct rcu_head *head) { - call_srcu(&tracepoint_srcu, head, srcu_free_old_probes); -} - -static __init int release_early_probes(void) -{ - struct rcu_head *tmp; - - ok_to_free_tracepoints = true; - - while (early_probes) { - tmp = early_probes; - early_probes = tmp->next; - call_rcu(tmp, rcu_free_old_probes); - } - - return 0; + kfree(container_of(head, struct tp_probes, rcu)); } -/* SRCU is initialized at core_initcall */ -postcore_initcall(release_early_probes); - static inline void release_probes(struct tracepoint_func *old) { if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); - /* - * We can't free probes if SRCU is not initialized yet. - * Postpone the freeing till after SRCU is initialized. - */ - if (unlikely(!ok_to_free_tracepoints)) { - tp_probes->rcu.next = early_probes; - early_probes = &tp_probes->rcu; - return; - } - - /* - * Tracepoint probes are protected by both sched RCU and SRCU, - * by calling the SRCU callback in the sched RCU callback we - * cover both cases. So let us chain the SRCU and sched RCU - * callbacks to wait for both grace periods. - */ call_rcu(&tp_probes->rcu, rcu_free_old_probes); } } -- cgit v1.2.3-70-g09d2 From 13d750c2c03e9861e15268574ed2c239cca9c9d5 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:12 -0400 Subject: tracing/ftrace: disable preemption in syscall probe In preparation for allowing system call enter/exit instrumentation to handle page faults, make sure that ftrace can handle this change by explicitly disabling preemption within the ftrace system call tracepoint probes to respect the current expectations within ftrace ring buffer code. This change does not yet allow ftrace to take page faults per se within its probe, but allows its existing probes to adapt to the upcoming change. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. 
McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Link: https://lore.kernel.org/20241009010718.2050182-3-mathieu.desnoyers@efficios.com Acked-by: Masami Hiramatsu (Google) Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/trace/trace_events.h | 39 ++++++++++++++++++++++++++++++++------- kernel/trace/trace_syscalls.c | 12 ++++++++++++ 2 files changed, 44 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 8bcbb9ee44de..63071aa5923d 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -263,6 +263,9 @@ static struct trace_event_fields trace_event_fields_##call[] = { \ tstruct \ {} }; +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, name, proto, args, print) @@ -396,11 +399,11 @@ static inline notrace int trace_event_get_offsets_##call( \ #include "stages/stage6_event_callback.h" -#undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ - \ + +#undef __DECLARE_EVENT_CLASS +#define __DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace void \ -trace_event_raw_event_##call(void *__data, proto) \ +do_trace_event_raw_event_##call(void *__data, proto) \ { \ struct trace_event_file *trace_file = __data; \ struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\ @@ -425,15 +428,35 @@ trace_event_raw_event_##call(void *__data, proto) \ \ trace_event_buffer_commit(&fbuffer); \ } + +#undef DECLARE_EVENT_CLASS +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ + PARAMS(assign), PARAMS(print)) \ +static notrace void \ +trace_event_raw_event_##call(void *__data, proto) \ +{ \ + do_trace_event_raw_event_##call(__data, args); \ +} + +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \ +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ + PARAMS(assign), PARAMS(print)) \ +static notrace void \ +trace_event_raw_event_##call(void *__data, proto) \ +{ \ + preempt_disable_notrace(); \ + do_trace_event_raw_event_##call(__data, args); \ + preempt_enable_notrace(); \ +} + /* * The ftrace_test_probe is compiled out, it is only here as a build time check * to make sure that if the tracepoint handling changes, the ftrace probe will * fail to compile unless it too is updated. 
*/ -#undef DECLARE_EVENT_SYSCALL_CLASS -#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS - #undef DEFINE_EVENT #define DEFINE_EVENT(template, call, proto, args) \ static inline void ftrace_test_probe_##call(void) \ @@ -443,6 +466,8 @@ static inline void ftrace_test_probe_##call(void) \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#undef __DECLARE_EVENT_CLASS + #include "stages/stage7_class_define.h" #undef DECLARE_EVENT_CLASS diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 785733245ead..f9b21bac9d45 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -299,6 +299,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) int syscall_nr; int size; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; @@ -338,6 +344,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) struct trace_event_buffer fbuffer; int syscall_nr; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; -- cgit v1.2.3-70-g09d2 From 65e7462a16cea593025ca3b34c5d74e69b027ee0 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:13 -0400 Subject: tracing/perf: disable preemption in syscall probe In preparation for allowing system call enter/exit instrumentation to handle page faults, make sure that perf can handle this change by explicitly disabling preemption within the perf system call tracepoint probes to respect the current expectations within perf ring buffer code. This change does not yet allow perf to take page faults per se within its probe, but allows its existing probes to adapt to the upcoming change. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. 
McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Link: https://lore.kernel.org/20241009010718.2050182-4-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/trace/perf.h | 42 ++++++++++++++++++++++++++++++++++++++---- kernel/trace/trace_syscalls.c | 12 ++++++++++++ 2 files changed, 50 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/trace/perf.h b/include/trace/perf.h index ded997af481e..15cde7eac8b4 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -12,10 +12,10 @@ #undef __perf_task #define __perf_task(t) (__task = (t)) -#undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +#undef __DECLARE_EVENT_CLASS +#define __DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace void \ -perf_trace_##call(void *__data, proto) \ +do_perf_trace_##call(void *__data, proto) \ { \ struct trace_event_call *event_call = __data; \ struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\ @@ -55,8 +55,39 @@ perf_trace_##call(void *__data, proto) \ head, __task); \ } +/* + * Define unused __count and __task variables to use @args to pass + * arguments to do_perf_trace_##call. This is needed because the + * macros __perf_count and __perf_task introduce the side-effect to + * store copies into those local variables. + */ +#undef DECLARE_EVENT_CLASS +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ + PARAMS(assign), PARAMS(print)) \ +static notrace void \ +perf_trace_##call(void *__data, proto) \ +{ \ + u64 __count __attribute__((unused)); \ + struct task_struct *__task __attribute__((unused)); \ + \ + do_perf_trace_##call(__data, args); \ +} + #undef DECLARE_EVENT_SYSCALL_CLASS -#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \ +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ + PARAMS(assign), PARAMS(print)) \ +static notrace void \ +perf_trace_##call(void *__data, proto) \ +{ \ + u64 __count __attribute__((unused)); \ + struct task_struct *__task __attribute__((unused)); \ + \ + preempt_disable_notrace(); \ + do_perf_trace_##call(__data, args); \ + preempt_enable_notrace(); \ +} /* * This part is compiled out, it is only here as a build time check @@ -76,4 +107,7 @@ static inline void perf_test_probe_##call(void) \ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef __DECLARE_EVENT_CLASS + #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f9b21bac9d45..b1cc19806f3d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -596,6 +596,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int rctx; int size; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. 
+ */ + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; @@ -698,6 +704,12 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int rctx; int size; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; -- cgit v1.2.3-70-g09d2 From a3204c740a59bebb3b37a294d83f4e353303a52c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:16 -0400 Subject: tracing/ftrace: Add might_fault check to syscall probes Add a might_fault() check to validate that the ftrace sys_enter/sys_exit probe callbacks are indeed called from a context where page faults can be handled. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Link: https://lore.kernel.org/20241009010718.2050182-7-mathieu.desnoyers@efficios.com Acked-by: Masami Hiramatsu (Google) Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/trace/trace_events.h | 1 + kernel/trace/trace_syscalls.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 63071aa5923d..4f22136fd465 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -446,6 +446,7 @@ __DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ static notrace void \ trace_event_raw_event_##call(void *__data, proto) \ { \ + might_fault(); \ preempt_disable_notrace(); \ do_trace_event_raw_event_##call(__data, args); \ preempt_enable_notrace(); \ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b1cc19806f3d..6d6bbd56ed92 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -303,6 +303,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. */ + might_fault(); guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); @@ -348,6 +349,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. */ + might_fault(); guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); -- cgit v1.2.3-70-g09d2 From cdb537ac417938408ee819992f432c410f2d01a2 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:17 -0400 Subject: tracing/perf: Add might_fault check to syscall probes Add a might_fault() check to validate that the perf sys_enter/sys_exit probe callbacks are indeed called from a context where page faults can be handled. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. 
McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes
Link: https://lore.kernel.org/20241009010718.2050182-8-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers
Signed-off-by: Steven Rostedt (Google)
--- include/trace/perf.h | 1 + kernel/trace/trace_syscalls.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'kernel')
diff --git a/include/trace/perf.h b/include/trace/perf.h index 15cde7eac8b4..a1754b73a8f5 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h
@@ -84,6 +84,7 @@ perf_trace_##call(void *__data, proto) \ u64 __count __attribute__((unused)); \ struct task_struct *__task __attribute__((unused)); \ \ + might_fault(); \ preempt_disable_notrace(); \ do_perf_trace_##call(__data, args); \ preempt_enable_notrace(); \
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 6d6bbd56ed92..46aab0ab9350 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c
@@ -602,6 +602,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. */ + might_fault(); guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs);
@@ -710,6 +711,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. */ + might_fault(); guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs);
-- cgit v1.2.3-70-g09d2

From afe5960dc208fe069ddaaeb0994d857b24ac19d1 Mon Sep 17 00:00:00 2001
From: Levi Yun
Date: Fri, 13 Sep 2024 03:13:47 +0100
Subject: trace/trace_event_perf: remove duplicate samples on the first tracepoint event

When a tracepoint event is created with attr.freq = 1, 'hwc->period_left' is not initialized correctly. As a result, the first time the event occurs, perf_swevent_overflow() calculates the event overflow and perf_swevent_set_period() returns 3, which leads to the event being recorded three duplicate times.

Steps to reproduce:
1. Enable the tracepoint event & start tracing $ echo 1 > /sys/kernel/tracing/events/module/module_free $ echo 1 > /sys/kernel/tracing/tracing_on
2. Record with perf $ perf record -a --strict-freq -F 1 -e "module:module_free"
3. Trigger the module_free event. $ modprobe -i sunrpc $ modprobe -r sunrpc

Result:
- Trace pipe result: $ cat trace_pipe modprobe-174509 [003] ..... 6504.868896: module_free: sunrpc
- perf sample: modprobe 174509 [003] 6504.868980: module:module_free: sunrpc modprobe 174509 [003] 6504.868980: module:module_free: sunrpc modprobe 174509 [003] 6504.868980: module:module_free: sunrpc

Setting 'period_left' via perf_swevent_set_period(), as other software events do, solves this problem.
After patch:
- Trace pipe result: $ cat trace_pipe modprobe 1153096 [068] 613468.867774: module:module_free: xfs
- perf sample: modprobe 1153096 [068] 613468.867794: module:module_free: xfs

Link: https://lore.kernel.org/20240913021347.595330-1-yeoreum.yun@arm.com
Fixes: bd2b5b12849a ("perf_counter: More aggressive frequency adjustment")
Signed-off-by: Levi Yun
Acked-by: Namhyung Kim
Signed-off-by: Steven Rostedt (Google)
--- kernel/trace/trace_event_perf.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel')
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 05e791241812..3ff9caa4a71b 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c
@@ -352,10 +352,16 @@ void perf_uprobe_destroy(struct perf_event *p_event) int perf_trace_add(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; + struct hw_perf_event *hwc = &p_event->hw; if (!(flags & PERF_EF_START)) p_event->hw.state = PERF_HES_STOPPED; + if (is_sampling_event(p_event)) { + hwc->last_period = hwc->sample_period; + perf_swevent_set_period(p_event); + } + /* * If TRACE_REG_PERF_ADD returns false; no custom action was performed * and we need to take the default action of enqueueing our event on
-- cgit v1.2.3-70-g09d2

From eb887c4567d1b0e7684c026fe7df44afa96589e6 Mon Sep 17 00:00:00 2001
From: Uros Bizjak
Date: Mon, 7 Oct 2024 10:56:28 +0200
Subject: tracing: Use atomic64_inc_return() in trace_clock_counter()

Use atomic64_inc_return(&ref) instead of atomic64_add_return(1, &ref) to use the optimized implementation and to ease register pressure around the primitive on targets that implement an optimized variant.

Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Mathieu Desnoyers
Link: https://lore.kernel.org/20241007085651.48544-1-ubizjak@gmail.com
Signed-off-by: Uros Bizjak
Signed-off-by: Steven Rostedt (Google)
--- kernel/trace/trace_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel')
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 4702efb00ff2..4cb2ebc439be 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c
@@ -154,5 +154,5 @@ static atomic64_t trace_counter; */ u64 notrace trace_clock_counter(void) { - return atomic64_add_return(1, &trace_counter); + return atomic64_inc_return(&trace_counter); }
-- cgit v1.2.3-70-g09d2

From 2c33155ef678033b8a3105b824cdef930f05b47d Mon Sep 17 00:00:00 2001
From: Ryan Roberts
Date: Mon, 21 Oct 2024 15:18:31 +0100
Subject: tracing: Make percpu stack trace buffer invariant to PAGE_SIZE

Previously the size of "struct ftrace_stacks" depended upon PAGE_SIZE. For the common 4K page size, on a 64-bit system, sizeof(struct ftrace_stacks) was 32K. But for a 64K page size, sizeof(struct ftrace_stacks) was 512K. However, ftrace stack usage requirements should be invariant to page size. So let's redefine FTRACE_KSTACK_ENTRIES so that "struct ftrace_stacks" is always sized at 32K for 64-bit and 16K for 32-bit.

As a side effect, this removes the PAGE_SIZE compile-time constant assumption from this code, a removal that is required to reach the goal of boot-time page size selection.
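A quick sanity check of the new sizing arithmetic (a standalone sketch: the two macros and the struct shapes mirror the patched kernel/trace/trace.c, everything else is plain userspace C):

#include <stdio.h>

#define SZ_4K			4096
#define FTRACE_KSTACK_NESTING	4
/* As in the patch: entries per stack no longer depend on PAGE_SIZE. */
#define FTRACE_KSTACK_ENTRIES	(SZ_4K / FTRACE_KSTACK_NESTING)

struct ftrace_stack {
	unsigned long calls[FTRACE_KSTACK_ENTRIES];
};

struct ftrace_stacks {
	struct ftrace_stack stacks[FTRACE_KSTACK_NESTING];
};

int main(void)
{
	/* 1024 entries * sizeof(long) * 4 nesting levels. */
	printf("entries per stack: %d\n", FTRACE_KSTACK_ENTRIES);
	printf("sizeof(struct ftrace_stacks): %zu\n",
	       sizeof(struct ftrace_stacks));
	return 0;
}

On an LP64 build this prints 1024 entries and 32768 bytes; with 4-byte longs the same arithmetic gives 16K, matching the commit message regardless of the kernel page size.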
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers
Link: https://lore.kernel.org/20241021141832.3668264-1-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts
Signed-off-by: Steven Rostedt (Google)
--- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel')
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bdb776e6ceb9..f1d613d924e9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c
@@ -2898,7 +2898,7 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long /* Allow 4 levels of nesting: normal, softirq, irq, NMI */ #define FTRACE_KSTACK_NESTING 4 -#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING) +#define FTRACE_KSTACK_ENTRIES (SZ_4K / FTRACE_KSTACK_NESTING) struct ftrace_stack { unsigned long calls[FTRACE_KSTACK_ENTRIES];
-- cgit v1.2.3-70-g09d2

From 77a1326f64c3245ae9d2f9297abec5c8a0f11f58 Mon Sep 17 00:00:00 2001
From: Justin Stitt
Date: Mon, 14 Oct 2024 14:13:14 -0700
Subject: tracing: Replace multiple deprecated strncpy with memcpy

strncpy() is deprecated for use on NUL-terminated destination strings [1] and as such we should prefer more robust and less ambiguous string interfaces.

String copy operations involving manual pointer offset and length calculations followed by explicit NUL-byte assignments are best changed to either strscpy or memcpy. strscpy is not a drop-in replacement as @len would need one subtracted from it to avoid truncating the source string. To not sabotage the readability of the current code, use memcpy (retaining the manual NUL assignment) as this unambiguously describes the desired behavior.

Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1]
Link: https://github.com/KSPP/linux/issues/90 [2]
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: linux-hardening@vger.kernel.org
Link: https://lore.kernel.org/20241014-strncpy-kernel-trace-trace_events_filter-c-v2-1-d821e81e371e@google.com
Reviewed-by: Kees Cook
Signed-off-by: Justin Stitt
Signed-off-by: Steven Rostedt (Google)
--- kernel/trace/trace_events_filter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0c611b281a5b..78051de581e7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c
@@ -1616,7 +1616,7 @@ static int parse_pred(const char *str, void *data, goto err_free; } - strncpy(num_buf, str + s, len); + memcpy(num_buf, str + s, len); num_buf[len] = 0; ret = kstrtoul(num_buf, 0, &ip);
@@ -1694,7 +1694,7 @@ static int parse_pred(const char *str, void *data, if (!pred->regex) goto err_mem; pred->regex->len = len; - strncpy(pred->regex->pattern, str + s, len); + memcpy(pred->regex->pattern, str + s, len); pred->regex->pattern[len] = 0; } else if (!strncmp(str + i, "CPUS", 4)) {
@@ -1859,7 +1859,7 @@ static int parse_pred(const char *str, void *data, if (!pred->regex) goto err_mem; pred->regex->len = len; - strncpy(pred->regex->pattern, str + s, len); + memcpy(pred->regex->pattern, str + s, len); pred->regex->pattern[len] = 0; filter_build_regex(pred);
@@ -1919,7 +1919,7 @@ static int parse_pred(const char *str, void *data, goto err_free; } - strncpy(num_buf, str + s, len); + memcpy(num_buf, str + s, len); num_buf[len] = 0; /* Make sure it is a value */
-- cgit v1.2.3-70-g09d2

From e9f0a363473585a0f51b1b61a9bb15a63808d6ea Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Tue, 22 Oct 2024 13:01:12 +0200
Subject:
tracing: Remove TRACE_FLAG_IRQS_NOSUPPORT It was possible to enable tracing with no IRQ tracing support. The tracing infrastructure would then record TRACE_FLAG_IRQS_NOSUPPORT as the only tracing flag and show an 'X' in the output. The last user of this feature was PPC32 which managed to implement it during the PowerPC merge in 2009. Since then, it was unused and the PPC32 dependency was finally removed in commit 0ea5ee035133a ("tracing: Remove PPC32 wart from config TRACING_SUPPORT"). Since the PowerPC merge, the code behind !CONFIG_TRACE_IRQFLAGS_SUPPORT with TRACING enabled can no longer be selected or used, and the 'X' is not displayed or recorded. Remove CONFIG_TRACE_IRQFLAGS_SUPPORT from the tracing code. Remove TRACE_FLAG_IRQS_NOSUPPORT. Cc: Peter Zijlstra Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241022110112.XJI8I9T2@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/ftrace.rst | 3 --- include/linux/trace_events.h | 13 ------------- kernel/trace/trace_output.c | 1 - 3 files changed, 17 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index 4073ca48af4a..74d5bd801b1a 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -1031,9 +1031,6 @@ explains which is which. CPU#: The CPU which the process was running on. irqs-off: 'd' interrupts are disabled. '.' otherwise. - .. caution:: If the architecture does not support a way to - read the irq flags variable, an 'X' will always - be printed here. need-resched: - 'N' both TIF_NEED_RESCHED and PREEMPT_NEED_RESCHED is set, diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index f8f2e52653df..016b29a56c87 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -184,7 +184,6 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, - TRACE_FLAG_IRQS_NOSUPPORT = 0x02, TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, @@ -193,7 +192,6 @@ enum trace_flag_type { TRACE_FLAG_BH_OFF = 0x80, }; -#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) { unsigned int irq_status = irqs_disabled_flags(irqflags) ? @@ -207,17 +205,6 @@ static inline unsigned int tracing_gen_ctx(void) local_save_flags(irqflags); return tracing_gen_ctx_flags(irqflags); } -#else - -static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) -{ - return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); -} -static inline unsigned int tracing_gen_ctx(void) -{ - return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); -} -#endif static inline unsigned int tracing_gen_ctx_dec(void) { diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 868f2f912f28..2ee6613dce6d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -460,7 +460,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' : (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : bh_off ? 'b' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
'X' : '.'; switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | -- cgit v1.2.3-70-g09d2 From a9cfb8778c43fc473ae16cddb6e9611705721b31 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 31 Oct 2024 11:20:53 -0400 Subject: tracing: Introduce tracepoint extended structure Shrink the struct tracepoint size from 80 bytes to 72 bytes on x86-64 by moving the (typically NULL) regfunc/unregfunc pointers to an extended structure. Tested-by: Jordan Rife Cc: Michael Jeanson Cc: Thomas Gleixner Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Cc: Jordan Rife Cc: linux-trace-kernel@vger.kernel.org Link: https://lore.kernel.org/20241031152056.744137-2-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint-defs.h | 8 ++++++-- include/linux/tracepoint.h | 19 +++++++++++++------ kernel/tracepoint.c | 9 ++++----- 3 files changed, 23 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 60a6e8314d4c..967c08d9da84 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -29,6 +29,11 @@ struct tracepoint_func { int prio; }; +struct tracepoint_ext { + int (*regfunc)(void); + void (*unregfunc)(void); +}; + struct tracepoint { const char *name; /* Tracepoint name */ struct static_key_false key; @@ -36,9 +41,8 @@ struct tracepoint { void *static_call_tramp; void *iterator; void *probestub; - int (*regfunc)(void); - void (*unregfunc)(void); struct tracepoint_func __rcu *funcs; + struct tracepoint_ext *ext; }; #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 0dc67fad706c..862ab49177a4 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -302,7 +302,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * structures, so we create an array of pointers that will be used for iteration * on the tracepoints. 
*/ -#define DEFINE_TRACE_FN(_name, _reg, _unreg, proto, args) \ +#define __DEFINE_TRACE_EXT(_name, _ext, proto, args) \ static const char __tpstrtab_##_name[] \ __section("__tracepoints_strings") = #_name; \ extern struct static_call_key STATIC_CALL_KEY(tp_func_##_name); \ @@ -316,9 +316,9 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) .static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \ .iterator = &__traceiter_##_name, \ .probestub = &__probestub_##_name, \ - .regfunc = _reg, \ - .unregfunc = _unreg, \ - .funcs = NULL }; \ + .funcs = NULL, \ + .ext = _ext, \ + }; \ __TRACEPOINT_ENTRY(_name); \ int __traceiter_##_name(void *__data, proto) \ { \ @@ -341,8 +341,15 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) } \ DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name); -#define DEFINE_TRACE(name, proto, args) \ - DEFINE_TRACE_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args)); +#define DEFINE_TRACE_FN(_name, _reg, _unreg, _proto, _args) \ + static struct tracepoint_ext __tracepoint_ext_##_name = { \ + .regfunc = _reg, \ + .unregfunc = _unreg, \ + }; \ + __DEFINE_TRACE_EXT(_name, &__tracepoint_ext_##_name, PARAMS(_proto), PARAMS(_args)); + +#define DEFINE_TRACE(_name, _proto, _args) \ + __DEFINE_TRACE_EXT(_name, NULL, PARAMS(_proto), PARAMS(_args)); #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \ EXPORT_SYMBOL_GPL(__tracepoint_##name); \ diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 6474e2cf22c9..5658dc92f5b5 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -278,8 +278,8 @@ static int tracepoint_add_func(struct tracepoint *tp, struct tracepoint_func *old, *tp_funcs; int ret; - if (tp->regfunc && !static_key_enabled(&tp->key)) { - ret = tp->regfunc(); + if (tp->ext && tp->ext->regfunc && !static_key_enabled(&tp->key)) { + ret = tp->ext->regfunc(); if (ret < 0) return ret; } @@ -362,9 +362,8 @@ static int tracepoint_remove_func(struct tracepoint *tp, switch (nr_func_state(tp_funcs)) { case TP_FUNC_0: /* 1->0 */ /* Removed last function */ - if (tp->unregfunc && static_key_enabled(&tp->key)) - tp->unregfunc(); - + if (tp->ext && tp->ext->unregfunc && static_key_enabled(&tp->key)) + tp->ext->unregfunc(); static_branch_disable(&tp->key); /* Set iterator static call */ tracepoint_update_call(tp, tp_funcs); -- cgit v1.2.3-70-g09d2 From 2e8a12b82c40466204a832cf1a3ae9e9080710dc Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 31 Oct 2024 11:20:55 -0400 Subject: tracing: Fix syscall tracepoint use-after-free The grace period used internally within tracepoint.c:release_probes() uses call_rcu() to batch waiting for quiescence of old probe arrays, rather than using the tracepoint_synchronize_unregister() which blocks while waiting for quiescence. With the introduction of faultable syscall tracepoints, this causes use-after-free issues reproduced with syzkaller. Fix this by using the appropriate call_rcu() or call_rcu_tasks_trace() before invoking the rcu_free_old_probes callback. This can be chosen using the tracepoint_is_faultable() API. A similar issue exists in bpf use of call_rcu(). Fixing this is left to a separate change. Reported-by: syzbot+b390c8062d8387b6272a@syzkaller.appspotmail.com Fixes: a363d27cdbc2 ("tracing: Allow system call tracepoints to handle page faults") Tested-by: Jordan Rife Cc: Michael Jeanson Cc: Thomas Gleixner Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. 
McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Cc: Jordan Rife Cc: linux-trace-kernel@vger.kernel.org Link: https://lore.kernel.org/20241031152056.744137-4-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- kernel/tracepoint.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 5658dc92f5b5..1848ce7e2976 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -106,13 +106,16 @@ static void rcu_free_old_probes(struct rcu_head *head) kfree(container_of(head, struct tp_probes, rcu)); } -static inline void release_probes(struct tracepoint_func *old) +static inline void release_probes(struct tracepoint *tp, struct tracepoint_func *old) { if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); - call_rcu(&tp_probes->rcu, rcu_free_old_probes); + if (tracepoint_is_faultable(tp)) + call_rcu_tasks_trace(&tp_probes->rcu, rcu_free_old_probes); + else + call_rcu(&tp_probes->rcu, rcu_free_old_probes); } } @@ -334,7 +337,7 @@ static int tracepoint_add_func(struct tracepoint *tp, break; } - release_probes(old); + release_probes(tp, old); return 0; } @@ -405,7 +408,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, WARN_ON_ONCE(1); break; } - release_probes(old); + release_probes(tp, old); return 0; } -- cgit v1.2.3-70-g09d2 From 242b32d8073ed16868ff0f9381732e9782dea63b Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Thu, 31 Oct 2024 20:01:39 +0800 Subject: tracing: Replace strncpy() with strscpy() when copying comm Replace the deprecated[1] strncpy() calls with strscpy() when copying comm.
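For illustration only (a minimal sketch with hypothetical helper names, not code from this patch), the difference between the two interfaces for a comm copy is:

	#include <linux/sched.h>
	#include <linux/string.h>

	/* strncpy() neither guarantees NUL-termination when the source fills
	 * the buffer nor reports truncation, so callers terminate by hand. */
	static void copy_comm_strncpy(char *buf, struct task_struct *tsk)
	{
		strncpy(buf, tsk->comm, TASK_COMM_LEN);
		buf[TASK_COMM_LEN - 1] = '\0';	/* manual termination */
	}

	/* strscpy() always NUL-terminates and returns -E2BIG on truncation. */
	static void copy_comm_strscpy(char *buf, struct task_struct *tsk)
	{
		strscpy(buf, tsk->comm, TASK_COMM_LEN);
	}

strscpy() also skips strncpy()'s zero-padding of the remainder of the destination buffer, which these comm copies do not appear to rely on.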
Link: https://github.com/KSPP/linux/issues/90 [1] Cc: Cc: Link: https://lore.kernel.org/20241031120139.1343025-1-ruanjinjie@huawei.com Signed-off-by: Jinjie Ruan Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_events_hist.c | 4 ++-- kernel/trace/trace_sched_switch.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f1d613d924e9..a587fd7d7447 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1921,7 +1921,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; - strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN); + strscpy(max_data->comm, tsk->comm); max_data->pid = tsk->pid; /* * If tsk == current, then use current_uid(), as that does not use diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index cc2924ad32a3..c288b92fc4df 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1599,7 +1599,7 @@ static inline void save_comm(char *comm, struct task_struct *task) return; } - strncpy(comm, task->comm, TASK_COMM_LEN); + strscpy(comm, task->comm, TASK_COMM_LEN); } static void hist_elt_data_free(struct hist_elt_data *elt_data) @@ -3405,7 +3405,7 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) elt_data = context->elt->private_data; track_elt_data = track_data->elt.private_data; if (elt_data->comm) - strncpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN); + strscpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN); track_data->updated = true; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8a407adb0e1c..573b5d8e8a28 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -187,7 +187,7 @@ static inline char *get_saved_cmdlines(int idx) static inline void set_cmdline(int idx, const char *cmdline) { - strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); + strscpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); } static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) -- cgit v1.2.3-70-g09d2 From f44ec8733a8469143fde1984b5e6931b2e2f6f3f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 1 Nov 2024 11:17:52 -0700 Subject: bpf: put bpf_link's program when link is safe to be deallocated In general, BPF link's underlying BPF program should be considered to be reachable through attach hook -> link -> prog chain, and, pessimistically, we have to assume that as long as link's memory is not safe to free, attach hook's code might hold a pointer to BPF program and use it. As such, it's not (generally) correct to put link's program early before waiting for RCU GPs to go through. More eager bpf_prog_put() that we currently do is mostly correct due to BPF program's release code doing similar RCU GP waiting, but as will be shown in the following patches, BPF program can be non-sleepable (and, thus, reliant on only "classic" RCU GP), while BPF link's attach hook can have sleepable semantics and needs to be protected by RCU Tasks Trace, and for such cases BPF link has to go through RCU Tasks Trace + "classic" RCU GPs before being deallocated. And so, if we put BPF program early, we might free BPF program before we free BPF link, leading to use-after-free situation. So, this patch defers bpf_prog_put() until we are ready to perform bpf_link's deallocation. 
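Schematically, the resulting teardown order is as follows (a condensed sketch of this patch's bpf_link_free()/bpf_link_dealloc() pair; the sleepable and non-deferred branches shown in the diff below are omitted here):

	/* in the deferred path, runs only from the RCU callback,
	 * i.e. once the link itself is known to be unreachable */
	static void bpf_link_dealloc(struct bpf_link *link)
	{
		/* put the program only now that the link can't be reached */
		if (link->prog)
			bpf_prog_put(link->prog);
		/* free bpf_link and its containing memory */
		link->ops->dealloc_deferred(link);
	}

	static void bpf_link_free(struct bpf_link *link)
	{
		/* detach BPF program, clean up used resources */
		if (link->prog)
			link->ops->release(link);
		/* no early bpf_prog_put() here; dealloc runs after the GP */
		call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
	}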
At worst, this delays BPF program freeing by one extra RCU GP, but that seems completely acceptable. Alternatively, we'd need more elaborate ways to determine BPF hook, BPF link, and BPF program lifetimes, and how they relate to each other, which seems like an unnecessary complication. Note, for most BPF links we will still perform eager bpf_prog_put() and link dealloc, so for those BPF links there are no observable changes whatsoever. Only BPF links that use deferred dealloc might notice slightly delayed freeing of BPF programs. Also, to reduce code and logic duplication, extract program put + link dealloc logic into bpf_link_dealloc() helper. Link: https://lore.kernel.org/20241101181754.782341-1-andrii@kernel.org Tested-by: Jordan Rife Signed-off-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (Google) --- kernel/bpf/syscall.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a8f1808a1ca5..aa7246a399f3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2976,12 +2976,24 @@ void bpf_link_inc(struct bpf_link *link) atomic64_inc(&link->refcnt); } +static void bpf_link_dealloc(struct bpf_link *link) +{ + /* now that we know that bpf_link itself can't be reached, put underlying BPF program */ + if (link->prog) + bpf_prog_put(link->prog); + + /* free bpf_link and its containing memory */ + if (link->ops->dealloc_deferred) + link->ops->dealloc_deferred(link); + else + link->ops->dealloc(link); +} + static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) { struct bpf_link *link = container_of(rcu, struct bpf_link, rcu); - /* free bpf_link and its containing memory */ - link->ops->dealloc_deferred(link); + bpf_link_dealloc(link); } static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) @@ -3003,7 +3015,6 @@ static void bpf_link_free(struct bpf_link *link) sleepable = link->prog->sleepable; /* detach BPF program, clean up used resources */ ops->release(link); - bpf_prog_put(link->prog); } if (ops->dealloc_deferred) { /* schedule BPF link deallocation; if underlying BPF program * is sleepable, we need to first wait for RCU tasks trace * sync, then go through "classic" RCU grace period */ if (sleepable) call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); else call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); - } else if (ops->dealloc) - ops->dealloc(link); + } else if (ops->dealloc) { + bpf_link_dealloc(link); + } } static void bpf_link_put_deferred(struct work_struct *work) -- cgit v1.2.3-70-g09d2 From 61c6fefa92bb4ed7a34163b94f6ffac628237a29 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 1 Nov 2024 11:17:53 -0700 Subject: bpf: decouple BPF link/attach hook and BPF program sleepable semantics BPF link's lifecycle protection scheme depends on both BPF hook and BPF program. If *either* of those requires an RCU Tasks Trace GP, then we need to go through a chain of GPs before putting BPF program refcount and deallocating BPF link memory. This patch adds bpf_link-specific sleepable flag, which can be set to true even if underlying BPF program is not sleepable itself. If either link->sleepable or link->prog->sleepable is true, we'll go through a chain of RCU Tasks Trace GP and RCU GP before putting BPF program and freeing memory. This will be used to protect BPF link for sleepable (faultable) raw tracepoints in the next patch.
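Condensed from the diff below, the grace-period choice in the free path becomes:

	/* Either a sleepable attach hook (link->sleepable) or a sleepable
	 * program forces the RCU Tasks Trace GP chained with a "classic"
	 * RCU GP before deallocation. */
	if (link->sleepable || (link->prog && link->prog->sleepable))
		call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
	else
		call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);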
Link: https://lore.kernel.org/20241101181754.782341-2-andrii@kernel.org Tested-by: Jordan Rife Signed-off-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (Google) --- include/linux/bpf.h | 20 ++++++++++++++++++-- kernel/bpf/syscall.c | 39 ++++++++++++++++++++++++++++----------- 2 files changed, 46 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 19d8ca8ac960..e7236facadd4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1598,6 +1598,11 @@ struct bpf_link { enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; + /* whether BPF link itself has "sleepable" semantics, which can differ + * from underlying BPF program having a "sleepable" semantics, as BPF + * link's semantics is determined by target attach hook + */ + bool sleepable; /* rcu is used before freeing, work can be used to schedule that * RCU-based freeing before that, so they never overlap */ @@ -1614,8 +1619,10 @@ struct bpf_link_ops { */ void (*dealloc)(struct bpf_link *link); /* deallocate link resources callback, called after RCU grace period; - * if underlying BPF program is sleepable we go through tasks trace - * RCU GP and then "classic" RCU GP + * if either the underlying BPF program is sleepable or BPF link's + * target hook is sleepable, we'll go through tasks trace RCU GP and + * then "classic" RCU GP; this need for chaining tasks trace and + * classic RCU GPs is designated by setting bpf_link->sleepable flag */ void (*dealloc_deferred)(struct bpf_link *link); int (*detach)(struct bpf_link *link); @@ -2362,6 +2369,9 @@ int bpf_prog_new_fd(struct bpf_prog *prog); void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog); +void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + bool sleepable); int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); int bpf_link_settle(struct bpf_link_primer *primer); void bpf_link_cleanup(struct bpf_link_primer *primer); @@ -2717,6 +2727,12 @@ static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, { } +static inline void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + bool sleepable) +{ +} + static inline int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index aa7246a399f3..0f5540627911 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2933,17 +2933,33 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, - const struct bpf_link_ops *ops, struct bpf_prog *prog) +/* bpf_link_init_sleepable() allows to specify whether BPF link itself has + * "sleepable" semantics, which normally would mean that BPF link's attach + * hook can dereference link or link's underlying program for some time after + * detachment due to RCU Tasks Trace-based lifetime protection scheme. + * BPF program itself can be non-sleepable, yet, because it's transitively + * reachable through BPF link, its freeing has to be delayed until after RCU + * Tasks Trace GP. 
+ */ +void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + bool sleepable) { WARN_ON(ops->dealloc && ops->dealloc_deferred); atomic64_set(&link->refcnt, 1); link->type = type; + link->sleepable = sleepable; link->id = 0; link->ops = ops; link->prog = prog; } +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog) +{ + bpf_link_init_sleepable(link, type, ops, prog, false); +} + static void bpf_link_free_id(int id) { if (!id) @@ -3008,20 +3024,21 @@ static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) static void bpf_link_free(struct bpf_link *link) { const struct bpf_link_ops *ops = link->ops; - bool sleepable = false; bpf_link_free_id(link->id); - if (link->prog) { - sleepable = link->prog->sleepable; - /* detach BPF program, clean up used resources */ + /* detach BPF program, clean up used resources */ + if (link->prog) ops->release(link); - } if (ops->dealloc_deferred) { - /* schedule BPF link deallocation; if underlying BPF program - * is sleepable, we need to first wait for RCU tasks trace - * sync, then go through "classic" RCU grace period + /* Schedule BPF link deallocation, which will only then + * trigger putting BPF program refcount. + * If underlying BPF program is sleepable or BPF link's target + * attach hookpoint is sleepable or otherwise requires RCU GPs + * to ensure link and its underlying BPF program is not + * reachable anymore, we need to first wait for RCU tasks + * trace sync, and then go through "classic" RCU grace period */ - if (sleepable) + if (link->sleepable || (link->prog && link->prog->sleepable)) call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); else call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); -- cgit v1.2.3-70-g09d2 From 24507ce81eaf0c34d91f3d1acaa73ee2f796190a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 1 Nov 2024 11:17:54 -0700 Subject: bpf: ensure RCU Tasks Trace GP for sleepable raw tracepoint BPF links Now that kernel supports sleepable tracepoints, the fact that bpf_probe_unregister() is asynchronous, i.e., that it doesn't wait for any in-flight tracepoints to conclude before returning, we now need to delay BPF raw tp link's deallocation and bpf_prog_put() of its underlying BPF program (regardless of program's own sleepable semantics) until after full RCU Tasks Trace GP. With that GP over, we'll have a guarantee that no tracepoint can reach BPF link and thus its BPF program. We use newly added tracepoint_is_faultable() check to know when this RCU Tasks Trace GP is necessary and utilize BPF link's own sleepable flag passed through bpf_link_init_sleepable() initializer. 
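Condensed, the attach-side change in the diff below is a one-liner: the raw tracepoint link's sleepable flag simply mirrors the tracepoint's faultability.

	bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
				&bpf_raw_tp_link_lops, prog,
				tracepoint_is_faultable(btp->tp));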
Link: https://lore.kernel.org/20241101181754.782341-3-andrii@kernel.org Tested-by: Jordan Rife Reported-by: Jordan Rife Fixes: a363d27cdbc2 ("tracing: Allow system call tracepoints to handle page faults") Signed-off-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (Google) --- kernel/bpf/syscall.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0f5540627911..db2a987504b2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -3845,8 +3846,9 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, - &bpf_raw_tp_link_lops, prog); + bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, + &bpf_raw_tp_link_lops, prog, + tracepoint_is_faultable(btp->tp)); link->btp = btp; link->cookie = cookie; -- cgit v1.2.3-70-g09d2 From 6371b4bc179aad17d84ee301b409a5a8ce675657 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 7 Nov 2024 12:05:30 +0000 Subject: tracing: Remove redundant check on field->field in histograms The check on field->field being true is handled as the first check of the cascaded if statement, so by the time the HIST_FIELD_FL_STACKTRACE branch is reached field->field must be NULL, making the later check on it redundant. Remove it. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241107120530.18728-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index c288b92fc4df..9c058aa8baf3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1354,10 +1354,7 @@ static const char *hist_field_name(struct hist_field *field, } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) field_name = "common_timestamp"; else if (field->flags & HIST_FIELD_FL_STACKTRACE) { - if (field->field) - field_name = field->field->name; - else - field_name = "common_stacktrace"; + field_name = "common_stacktrace"; } else if (field->flags & HIST_FIELD_FL_HITCOUNT) field_name = "hitcount"; -- cgit v1.2.3-70-g09d2 From 60b1f578b5789730d81460d1836dec7fa60510bf Mon Sep 17 00:00:00 2001 From: Jeff Xie Date: Tue, 8 Oct 2024 11:31:59 +0800 Subject: ftrace: Get the true parent ip for function tracer When using both the function tracer and the function graph tracer simultaneously, the function tracer sometimes captures a fake parent ip (return_to_handler) instead of the true parent ip. This issue is easy to reproduce. Below are my reproduction steps: jeff-labs:~/bin # ./trace-net.sh jeff-labs:~/bin # cat /sys/kernel/debug/tracing/instances/foo/trace | grep return_to_handler trace-net.sh-405 [001] ...2. 31.859501: avc_has_perm+0x4/0x190 <-return_to_handler+0x0/0x40 trace-net.sh-405 [001] ...2. 31.859503: simple_setattr+0x4/0x70 <-return_to_handler+0x0/0x40 trace-net.sh-405 [001] ...2. 31.859503: truncate_pagecache+0x4/0x60 <-return_to_handler+0x0/0x40 trace-net.sh-405 [001] ...2. 31.859505: unmap_mapping_range+0x4/0x140 <-return_to_handler+0x0/0x40 trace-net.sh-405 [001] ...3. 31.859508: _raw_spin_unlock+0x4/0x30 <-return_to_handler+0x0/0x40 [...]
The following is my simple trace script: jeff-labs:~/bin # cat ./trace-net.sh TRACE_PATH="/sys/kernel/tracing" set_events() { echo 1 > $1/events/net/enable echo 1 > $1/events/tcp/enable echo 1 > $1/events/sock/enable echo 1 > $1/events/napi/enable echo 1 > $1/events/fib/enable echo 1 > $1/events/neigh/enable } set_events ${TRACE_PATH} echo 1 > ${TRACE_PATH}/options/sym-offset echo 1 > ${TRACE_PATH}/options/funcgraph-tail echo 1 > ${TRACE_PATH}/options/funcgraph-proc echo 1 > ${TRACE_PATH}/options/funcgraph-abstime echo 'tcp_orphan*' > ${TRACE_PATH}/set_ftrace_notrace echo function_graph > ${TRACE_PATH}/current_tracer INSTANCE_FOO=${TRACE_PATH}/instances/foo if [ ! -e $INSTANCE_FOO ]; then mkdir ${INSTANCE_FOO} fi set_events ${INSTANCE_FOO} echo 1 > ${INSTANCE_FOO}/options/sym-offset echo 'tcp_orphan*' > ${INSTANCE_FOO}/set_ftrace_notrace echo function > ${INSTANCE_FOO}/current_tracer echo 1 > ${TRACE_PATH}/tracing_on echo 1 > ${INSTANCE_FOO}/tracing_on echo > ${TRACE_PATH}/trace echo > ${INSTANCE_FOO}/trace Link: https://lore.kernel.org/20241008033159.22459-1-jeff.xie@linux.dev Acked-by: Masami Hiramatsu (Google) Signed-off-by: Jeff Xie Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 65fed0bbc5c2..74c353164ca1 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -176,6 +176,27 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(&tr->array_buffer); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static __always_inline unsigned long +function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs) +{ + unsigned long true_parent_ip; + int idx = 0; + + true_parent_ip = parent_ip; + if (unlikely(parent_ip == (unsigned long)&return_to_handler) && fregs) + true_parent_ip = ftrace_graph_ret_addr(current, &idx, parent_ip, + (unsigned long *)ftrace_regs_get_stack_pointer(fregs)); + return true_parent_ip; +} +#else +static __always_inline unsigned long +function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs) +{ + return parent_ip; +} +#endif + static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) @@ -192,6 +213,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; + parent_ip = function_get_true_parent_ip(parent_ip, fregs); + trace_ctx = tracing_gen_ctx(); data = this_cpu_ptr(tr->array_buffer.data); @@ -239,6 +262,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, * recursive protection is performed. */ local_irq_save(flags); + parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); @@ -306,6 +330,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; + parent_ip = function_get_true_parent_ip(parent_ip, fregs); data = this_cpu_ptr(tr->array_buffer.data); if (atomic_read(&data->disabled)) goto out; @@ -352,6 +377,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, * recursive protection is performed. 
*/ local_irq_save(flags); + parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); -- cgit v1.2.3-70-g09d2 From 6ce5a6f0a07d37cc377df08a8d8a9c283420f323 Mon Sep 17 00:00:00 2001 From: Tatsuya S Date: Mon, 21 Oct 2024 07:14:53 +0000 Subject: tracing: Fix function name for trampoline An unrelated function name is sometimes shown in a stack trace, like the following, even though the entry should be a trampoline code address. This is caused by the trampoline code being created in an area where a module's .init.text section was freed after the module was loaded. bash-1344 [002] ..... 43.644608: => (MODULE INIT FUNCTION) => vfs_write => ksys_write => do_syscall_64 => entry_SYSCALL_64_after_hwframe To resolve this, when the function address of a stack trace entry is in a trampoline, output it without looking up the symbol name. Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241021071454.34610-2-tatsuya.s2862@gmail.com Signed-off-by: Tatsuya S Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 33 +++++++++++++++++++++++++-------- kernel/trace/trace.h | 7 +++++++ kernel/trace/trace_output.c | 4 ++++ 3 files changed, 36 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a587fd7d7447..566d99f9f086 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -975,7 +975,8 @@ static inline void trace_access_lock_init(void) #endif #ifdef CONFIG_STACKTRACE -static void __ftrace_trace_stack(struct trace_buffer *buffer, +static void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs); static inline void ftrace_trace_stack(struct trace_array *tr, @@ -984,7 +985,8 @@ static inline void ftrace_trace_stack(struct trace_array *tr, int skip, struct pt_regs *regs); #else -static inline void __ftrace_trace_stack(struct trace_buffer *buffer, +static inline void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { @@ -2912,7 +2914,8 @@ struct ftrace_stacks { static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); -static void __ftrace_trace_stack(struct trace_buffer *buffer, +static void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { @@ -2958,6 +2961,20 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, nr_entries = stack_trace_save(fstack->calls, size, skip); } +#ifdef CONFIG_DYNAMIC_FTRACE + /* Mark entry of stack trace as trampoline code */ + if (tr->ops && tr->ops->trampoline) { + unsigned long tramp_start = tr->ops->trampoline; + unsigned long tramp_end = tramp_start + tr->ops->trampoline_size; + unsigned long *calls = fstack->calls; + + for (int i = 0; i < nr_entries; i++) { + if (calls[i] >= tramp_start && calls[i] < tramp_end) + calls[i] = FTRACE_TRAMPOLINE_MARKER; + } + } +#endif + event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, struct_size(entry, caller, nr_entries), trace_ctx); @@ -2987,7 +3004,7 @@ static inline void ftrace_trace_stack(struct trace_array *tr, if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) return; - __ftrace_trace_stack(buffer, trace_ctx, skip, regs); + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs); } void __trace_stack(struct trace_array *tr, unsigned
int trace_ctx, @@ -2996,7 +3013,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, struct trace_buffer *buffer = tr->array_buffer.buffer; if (rcu_is_watching()) { - __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL); return; } @@ -3013,7 +3030,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, return; ct_irq_enter_irqson(); - __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL); ct_irq_exit_irqson(); } @@ -3030,8 +3047,8 @@ void trace_dump_stack(int skip) /* Skip 1 to skip this function. */ skip++; #endif - __ftrace_trace_stack(printk_trace->array_buffer.buffer, - tracing_gen_ctx(), skip, NULL); + __ftrace_trace_stack(printk_trace, printk_trace->array_buffer.buffer, + tracing_gen_ctx(), skip, NULL); } EXPORT_SYMBOL_GPL(trace_dump_stack); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 638f452eec10..6e66b666c3e9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -2172,4 +2172,11 @@ static inline int rv_init_interface(void) } #endif +/* + * This is used only to distinguish + * function address from trampoline code. + * So this value has no meaning. + */ +#define FTRACE_TRAMPOLINE_MARKER ((unsigned long) INT_MAX) + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 2ee6613dce6d..e08aee34ef63 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1245,6 +1245,10 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, break; trace_seq_puts(s, " => "); + if ((*p) == FTRACE_TRAMPOLINE_MARKER) { + trace_seq_puts(s, "[FTRACE TRAMPOLINE]\n"); + continue; + } seq_print_ip_sym(s, (*p) + delta, flags); trace_seq_putc(s, '\n'); } -- cgit v1.2.3-70-g09d2 From 45af52e7d3b8560f21d139b3759735eead8b1653 Mon Sep 17 00:00:00 2001 From: guoweikang Date: Wed, 20 Nov 2024 13:27:49 +0800 Subject: ftrace: Fix regression with module command in stack_trace_filter When executing the following command: # echo "write*:mod:ext3" > /sys/kernel/tracing/stack_trace_filter The current mod command causes a null pointer dereference. While commit 0f17976568b3f ("ftrace: Fix regression with module command in stack_trace_filter") has addressed part of the issue, it left a corner case unhandled, which still results in a kernel crash. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241120052750.275463-1-guoweikang.kernel@gmail.com Fixes: 04ec7bb642b77 ("tracing: Have the trace_array hold the list of registered func probes") Signed-off-by: guoweikang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4c28dd177ca6..5ff0822342ac 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5076,6 +5076,9 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, char *func; int ret; + if (!tr) + return -ENODEV; + /* match_records() modifies func, and we need the original */ func = kstrdup(func_orig, GFP_KERNEL); if (!func) -- cgit v1.2.3-70-g09d2
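For context on the guard added above: stack_trace_filter is serviced by a global ftrace_ops that is not tied to any trace_array, so ftrace_mod_callback() can legitimately be invoked with tr == NULL. A hedged sketch of the fixed callback (simplified, not the verbatim kernel code):

	static int ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
				       char *func_orig, char *cmd, char *module, int enable)
	{
		/* the added guard: previously tr was dereferenced further down,
		 * crashing when the command arrived via stack_trace_filter */
		if (!tr)
			return -ENODEV;
		/* ... existing mod-command handling continues here ... */
		return 0;
	}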