From 1e1dcd93b468901e114f279c94a0b356adc5e7cd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 6 Apr 2016 18:43:24 -0700 Subject: perf: split perf_trace_buf_prepare into alloc and update parts split allows to move expensive update of 'struct trace_entry' to later phase. Repurpose unused 1st argument of perf_tp_event() to indicate event type. While splitting use temp variable 'rctx' instead of '*rctx' to avoid unnecessary loads done by the compiler due to -fno-strict-aliasing Signed-off-by: Alexei Starovoitov Acked-by: Peter Zijlstra (Intel) Signed-off-by: David S. Miller --- kernel/events/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index de24fbce5277..d8512883c0a0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6987,7 +6987,7 @@ static int perf_tp_event_match(struct perf_event *event, return 1; } -void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, +void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task) { @@ -6999,9 +6999,11 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, .data = record, }; - perf_sample_data_init(&data, addr, 0); + perf_sample_data_init(&data, 0, 0); data.raw = &raw; + perf_trace_buf_update(record, event_type); + hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); -- cgit v1.2.3-70-g09d2 From 98b5c2c65c2951772a8fc661f50d675e450e8bce Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 6 Apr 2016 18:43:25 -0700 Subject: perf, bpf: allow bpf programs attach to tracepoints introduce BPF_PROG_TYPE_TRACEPOINT program type and allow it to be attached to the perf tracepoint handler, which will copy the arguments into the per-cpu buffer and pass it to the bpf program as its first argument. The layout of the fields can be discovered by doing 'cat /sys/kernel/debug/tracing/events/sched/sched_switch/format' prior to the compilation of the program with exception that first 8 bytes are reserved and not accessible to the program. This area is used to store the pointer to 'struct pt_regs' which some of the bpf helpers will use: +---------+ | 8 bytes | hidden 'struct pt_regs *' (inaccessible to bpf program) +---------+ | N bytes | static tracepoint fields defined in tracepoint/format (bpf readonly) +---------+ | dynamic | __dynamic_array bytes of tracepoint (inaccessible to bpf yet) +---------+ Not that all of the fields are already dumped to user space via perf ring buffer and broken application access it directly without consulting tracepoint/format. Same rule applies here: static tracepoint fields should only be accessed in a format defined in tracepoint/format. The order of fields and field sizes are not an ABI. Signed-off-by: Alexei Starovoitov Acked-by: Peter Zijlstra (Intel) Signed-off-by: David S. Miller --- include/trace/perf.h | 10 +++++++++- include/uapi/linux/bpf.h | 1 + kernel/events/core.c | 13 +++++++++---- 3 files changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel/events') diff --git a/include/trace/perf.h b/include/trace/perf.h index 77cd9043b7e4..a182306eefd7 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -34,6 +34,7 @@ perf_trace_##call(void *__data, proto) \ struct trace_event_call *event_call = __data; \ struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\ struct trace_event_raw_##call *entry; \ + struct bpf_prog *prog = event_call->prog; \ struct pt_regs *__regs; \ u64 __count = 1; \ struct task_struct *__task = NULL; \ @@ -45,7 +46,7 @@ perf_trace_##call(void *__data, proto) \ __data_size = trace_event_get_offsets_##call(&__data_offsets, args); \ \ head = this_cpu_ptr(event_call->perf_events); \ - if (__builtin_constant_p(!__task) && !__task && \ + if (!prog && __builtin_constant_p(!__task) && !__task && \ hlist_empty(head)) \ return; \ \ @@ -63,6 +64,13 @@ perf_trace_##call(void *__data, proto) \ \ { assign; } \ \ + if (prog) { \ + *(struct pt_regs **)entry = __regs; \ + if (!trace_call_bpf(prog, entry) || hlist_empty(head)) { \ + perf_swevent_put_recursion_context(rctx); \ + return; \ + } \ + } \ perf_trace_buf_submit(entry, __entry_size, rctx, \ event_call->event.type, __count, __regs, \ head, __task); \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 23917bb47bf3..70eda5aeb304 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -92,6 +92,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_KPROBE, BPF_PROG_TYPE_SCHED_CLS, BPF_PROG_TYPE_SCHED_ACT, + BPF_PROG_TYPE_TRACEPOINT, }; #define BPF_PSEUDO_MAP_FD 1 diff --git a/kernel/events/core.c b/kernel/events/core.c index d8512883c0a0..e5ffe97d6166 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6725,12 +6725,13 @@ int perf_swevent_get_recursion_context(void) } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); -inline void perf_swevent_put_recursion_context(int rctx) +void perf_swevent_put_recursion_context(int rctx) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); put_recursion_context(swhash->recursion, rctx); } +EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { @@ -7106,6 +7107,7 @@ static void perf_event_free_filter(struct perf_event *event) static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { + bool is_kprobe, is_tracepoint; struct bpf_prog *prog; if (event->attr.type != PERF_TYPE_TRACEPOINT) @@ -7114,15 +7116,18 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) if (event->tp_event->prog) return -EEXIST; - if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE)) - /* bpf programs can only be attached to u/kprobes */ + is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; + is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; + if (!is_kprobe && !is_tracepoint) + /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; prog = bpf_prog_get(prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->type != BPF_PROG_TYPE_KPROBE) { + if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || + (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { /* valid fd, but invalid bpf program type */ bpf_prog_put(prog); return -EINVAL; -- cgit v1.2.3-70-g09d2 From 32bbe0078afe86a8bf4c67c6b3477781b15e94dc Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 6 Apr 2016 18:43:28 -0700 Subject: bpf: sanitize bpf tracepoint access during bpf program loading remember the last byte of ctx access and at the time of attaching the program to tracepoint check that the program doesn't access bytes beyond defined in tracepoint fields This also disallows access to __dynamic_array fields, but can be relaxed in the future. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/linux/trace_events.h | 1 + kernel/bpf/verifier.c | 6 +++++- kernel/events/core.c | 8 ++++++++ kernel/trace/trace_events.c | 18 ++++++++++++++++++ 5 files changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 198f6ace70ec..b2365a6eba3d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -131,6 +131,7 @@ struct bpf_prog_type_list { struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; + u32 max_ctx_offset; const struct bpf_verifier_ops *ops; struct bpf_map **used_maps; struct bpf_prog *prog; diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 56f795e6a093..fe6441203b59 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -569,6 +569,7 @@ extern int trace_define_field(struct trace_event_call *call, const char *type, int is_signed, int filter_type); extern int trace_add_event_call(struct trace_event_call *call); extern int trace_remove_event_call(struct trace_event_call *call); +extern int trace_event_get_offsets(struct trace_event_call *call); #define is_signed_type(type) (((type)(-1)) < (type)1) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2e08f8e9b771..58792fed5678 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -652,8 +652,12 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, enum bpf_access_type t) { if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t)) + env->prog->aux->ops->is_valid_access(off, size, t)) { + /* remember the offset of last byte accessed in ctx */ + if (env->prog->aux->max_ctx_offset < off + size) + env->prog->aux->max_ctx_offset = off + size; return 0; + } verbose("invalid bpf_context access off=%d size=%d\n", off, size); return -EACCES; diff --git a/kernel/events/core.c b/kernel/events/core.c index e5ffe97d6166..9a01019ff7c8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7133,6 +7133,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } + if (is_tracepoint) { + int off = trace_event_get_offsets(event->tp_event); + + if (prog->aux->max_ctx_offset > off) { + bpf_prog_put(prog); + return -EACCES; + } + } event->tp_event->prog = prog; return 0; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 05ddc0820771..ced963049e0a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -204,6 +204,24 @@ static void trace_destroy_fields(struct trace_event_call *call) } } +/* + * run-time version of trace_event_get_offsets_() that returns the last + * accessible offset of trace fields excluding __dynamic_array bytes + */ +int trace_event_get_offsets(struct trace_event_call *call) +{ + struct ftrace_event_field *tail; + struct list_head *head; + + head = trace_get_fields(call); + /* + * head->next points to the last field with the largest offset, + * since it was added last by trace_define_field() + */ + tail = list_first_entry(head, struct ftrace_event_field, link); + return tail->offset + tail->size; +} + int trace_event_raw_init(struct trace_event_call *call) { int id; -- cgit v1.2.3-70-g09d2 From 85b67bcb7e4a23ced05e7020bf5843b9857f6881 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 18 Apr 2016 20:11:50 -0700 Subject: perf, bpf: minimize the size of perf_trace_() tracepoint handler move trace_call_bpf() into helper function to minimize the size of perf_trace_*() tracepoint handlers. text data bss dec hex filename 10541679 5526646 2945024 19013349 1221ee5 vmlinux_before 10509422 5526646 2945024 18981092 121a0e4 vmlinux_after It may seem that perf_fetch_caller_regs() can also be moved, but that is incorrect, since ip/sp will be wrong. bpf+tracepoint performance is not affected, since perf_swevent_put_recursion_context() is now inlined. export_symbol_gpl can also be dropped. No measurable change in normal perf tracepoints. Suggested-by: Steven Rostedt Signed-off-by: Alexei Starovoitov Acked-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt Signed-off-by: David S. Miller --- include/linux/trace_events.h | 5 +++++ include/trace/perf.h | 13 +++---------- kernel/events/core.c | 20 +++++++++++++++++++- 3 files changed, 27 insertions(+), 11 deletions(-) (limited to 'kernel/events') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index fe6441203b59..222f6aa0418f 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -609,6 +609,11 @@ extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); +void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, + struct trace_event_call *call, u64 count, + struct pt_regs *regs, struct hlist_head *head, + struct task_struct *task); + static inline void perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type, u64 count, struct pt_regs *regs, void *head, diff --git a/include/trace/perf.h b/include/trace/perf.h index a182306eefd7..88de5c205e86 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -64,16 +64,9 @@ perf_trace_##call(void *__data, proto) \ \ { assign; } \ \ - if (prog) { \ - *(struct pt_regs **)entry = __regs; \ - if (!trace_call_bpf(prog, entry) || hlist_empty(head)) { \ - perf_swevent_put_recursion_context(rctx); \ - return; \ - } \ - } \ - perf_trace_buf_submit(entry, __entry_size, rctx, \ - event_call->event.type, __count, __regs, \ - head, __task); \ + perf_trace_run_bpf_submit(entry, __entry_size, rctx, \ + event_call, __count, __regs, \ + head, __task); \ } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 5056abffef27..9eb23dc27462 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6741,7 +6741,6 @@ void perf_swevent_put_recursion_context(int rctx) put_recursion_context(swhash->recursion, rctx); } -EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { @@ -6998,6 +6997,25 @@ static int perf_tp_event_match(struct perf_event *event, return 1; } +void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, + struct trace_event_call *call, u64 count, + struct pt_regs *regs, struct hlist_head *head, + struct task_struct *task) +{ + struct bpf_prog *prog = call->prog; + + if (prog) { + *(struct pt_regs **)raw_data = regs; + if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) { + perf_swevent_put_recursion_context(rctx); + return; + } + } + perf_tp_event(call->event.type, count, raw_data, size, regs, head, + rctx, task); +} +EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); + void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task) -- cgit v1.2.3-70-g09d2