-rw-r--r--  tools/perf/util/bpf-filter.c                  | 288
-rw-r--r--  tools/perf/util/bpf_skel/sample-filter.h      |  11
-rw-r--r--  tools/perf/util/bpf_skel/sample_filter.bpf.c  |  42
-rw-r--r--  tools/perf/util/bpf_skel/vmlinux/vmlinux.h    |   5
4 files changed, 304 insertions, 42 deletions
diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c
index c5eb0b7eec19..0a1832564dd2 100644
--- a/tools/perf/util/bpf-filter.c
+++ b/tools/perf/util/bpf-filter.c
@@ -1,4 +1,45 @@
/* SPDX-License-Identifier: GPL-2.0 */
+/**
+ * Generic event filter for sampling events in BPF.
+ *
+ * The BPF program is fixed; it reads filter expressions from the 'filters'
+ * map and compares them against the sample data in order to reject samples
+ * that don't match. Each filter expression contains a sample flag (term) to
+ * compare, an operation (==, >=, and so on) and a value.
+ *
+ * Note that each entry has an array of filter expressions and it only succeeds
+ * when all of the expressions are satisfied. Logical OR is supported through a
+ * GROUP operation, which is satisfied when any of its member expressions
+ * evaluates to true. Nested GROUP operations are not allowed for now.
+ *
+ * To support non-root users, the filters map can be loaded and pinned in the BPF
+ * filesystem by root (perf record --setup-filter pin). Each user then gets a
+ * new entry in the shared filters map to fill with filter expressions, and the
+ * BPF program finds the filter using (task-id, event-id) as a key.
+ *
+ * The pinned BPF object (shared for regular users) has:
+ *
+ *                  event_hash                   |
+ *                  |        |                   |
+ *   event->id ---> |   id   | ---+   idx_hash   |      filters
+ *                  |        |    |   |       |  |      |       |
+ *                  |  ....  |    +-> |  idx  | -+--->  | exprs | ---> perf_bpf_filter_entry[]
+ *                                |   |       |  |      |       |            .op
+ *   task id (tgid) -------------+   |  ...  |  |      |  ...  |            .term (+ part)
+ *                                               |                           .value
+ *                                               |
+ *   ======= (root would skip this part) ======== (compares it in a loop)
+ *
+ * This is used for per-task use cases, while system-wide profiling (normally
+ * done by root) uses its own copy of the program and the maps so that it can
+ * proceed even if a lot of non-root users are using the filters at the same
+ * time. In this case the filters map has a single entry and there is no need
+ * to use the hash maps to get the index (key) of the filters map (IOW it's
+ * always 0).
+ *
+ * The BPF program returns 1 to accept the sample or 0 to drop it.
+ * The 'dropped' map keeps the number of samples dropped by the filter, which
+ * is then reported as lost samples.
+ */
#include <stdlib.h>
#include <fcntl.h>
#include <sys/ioctl.h>
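To make the diagram above concrete, here is a simplified sketch of the lookup chain in the order the (fixed) BPF program walks it. This is not the literal program (the real code is in the sample_filter.bpf.c hunks further down); error paths are condensed:

    /* Simplified sketch of the shared-map lookup chain above;
     * error handling and the root (single-entry) path are omitted. */
    __u64 eid = kctx->event->id;
    __u64 *rep_id;
    struct idx_hash_key key = { .tgid = bpf_get_current_pid_tgid() >> 32 };
    int *idx;

    /* inherited events report the parent's (primary) id */
    if (kctx->event->parent)
            eid = kctx->event->parent->id;

    /* instance id -> representative event id */
    rep_id = bpf_map_lookup_elem(&event_hash, &eid);
    if (rep_id == NULL)
            goto drop;

    /* (tgid, representative id) -> index into the shared 'filters' map */
    key.evt_id = *rep_id;
    idx = bpf_map_lookup_elem(&idx_hash, &key);
    if (idx == NULL)
            goto drop;

    /* filters[*idx] is the perf_bpf_filter_entry[] evaluated in a loop */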
@@ -6,6 +47,7 @@
#include <bpf/bpf.h>
#include <linux/err.h>
+#include <linux/list.h>
#include <api/fs/fs.h>
#include <internal/xyarray.h>
#include <perf/threadmap.h>
@@ -27,7 +69,14 @@
#define PERF_SAMPLE_TYPE(_st, opt) __PERF_SAMPLE_TYPE(PBF_TERM_##_st, PERF_SAMPLE_##_st, opt)
/* Index in the pinned 'filters' map. Should be released after use. */
-static int pinned_filter_idx = -1;
+struct pinned_filter_idx {
+ struct list_head list;
+ struct evsel *evsel;
+ u64 event_id;
+ int hash_idx;
+};
+
+static LIST_HEAD(pinned_filters);
static const struct perf_sample_info {
enum perf_bpf_filter_term type;
@@ -175,24 +224,145 @@ static int convert_to_tgid(int tid)
return tgid;
}
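The tgid conversion relies on convert_to_tgid(), which predates this patch and is not shown in full here. A minimal sketch of the /proc-based approach, assuming the usual "Tgid:" field of /proc/<tid>/status (the helper name and parsing details are illustrative, and <stdio.h> is assumed):

    /* Illustrative sketch: resolve a tid to its tgid by scanning the
     * "Tgid:" line of /proc/<tid>/status. Not the literal perf code. */
    static int convert_to_tgid_sketch(int tid)
    {
            char path[64], line[128];
            int tgid = -1;
            FILE *fp;

            snprintf(path, sizeof(path), "/proc/%d/status", tid);
            fp = fopen(path, "r");
            if (fp == NULL)
                    return -1;

            while (fgets(line, sizeof(line), fp)) {
                    if (sscanf(line, "Tgid: %d", &tgid) == 1)
                            break;
            }
            fclose(fp);
            return tgid;
    }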
-static int update_pid_hash(struct evsel *evsel, struct perf_bpf_filter_entry *entry)
+/*
+ * The event might be closed already so we cannot get the list of ids using the
+ * FD like in create_event_hash() below. Instead, iterate the event_hash map
+ * and delete all entries that have the event id as a value.
+ */
+static void destroy_event_hash(u64 event_id)
+{
+ int fd;
+ u64 key, *prev_key = NULL;
+ int num = 0, alloced = 32;
+ u64 *ids = calloc(alloced, sizeof(*ids));
+
+ if (ids == NULL)
+ return;
+
+ fd = get_pinned_fd("event_hash");
+ if (fd < 0) {
+ pr_debug("cannot get fd for 'event_hash' map\n");
+ free(ids);
+ return;
+ }
+
+ /* Iterate the whole map to collect keys for the event id. */
+ while (!bpf_map_get_next_key(fd, prev_key, &key)) {
+ u64 id;
+
+ if (bpf_map_lookup_elem(fd, &key, &id) == 0 && id == event_id) {
+ if (num == alloced) {
+ void *tmp;
+
+ alloced *= 2;
+ tmp = realloc(ids, alloced * sizeof(*ids));
+ if (tmp == NULL)
+ break;
+
+ ids = tmp;
+ }
+ ids[num++] = key;
+ }
+
+ prev_key = &key;
+ }
+
+ for (int i = 0; i < num; i++)
+ bpf_map_delete_elem(fd, &ids[i]);
+
+ free(ids);
+ close(fd);
+}
+
+/*
+ * Return a representative id if ok, or 0 on failure.
+ *
+ * The perf_event->id is good for this, but an evsel has multiple instances
+ * for CPUs and tasks. So pick the first id and set up a hash from the id of
+ * each instance to the representative id (the first one).
+ */
+static u64 create_event_hash(struct evsel *evsel)
+{
+ int x, y, fd;
+ u64 the_id = 0, id;
+
+ fd = get_pinned_fd("event_hash");
+ if (fd < 0) {
+ pr_err("cannot get fd for 'event_hash' map\n");
+ return 0;
+ }
+
+ for (x = 0; x < xyarray__max_x(evsel->core.fd); x++) {
+ for (y = 0; y < xyarray__max_y(evsel->core.fd); y++) {
+ int ret = ioctl(FD(evsel, x, y), PERF_EVENT_IOC_ID, &id);
+
+ if (ret < 0) {
+ pr_err("Failed to get the event id\n");
+ if (the_id)
+ destroy_event_hash(the_id);
+ return 0;
+ }
+
+ if (the_id == 0)
+ the_id = id;
+
+ bpf_map_update_elem(fd, &id, &the_id, BPF_ANY);
+ }
+ }
+
+ close(fd);
+ return the_id;
+}
+
+static void destroy_idx_hash(struct pinned_filter_idx *pfi)
+{
+ int fd, nr;
+ struct perf_thread_map *threads;
+
+ fd = get_pinned_fd("filters");
+ bpf_map_delete_elem(fd, &pfi->hash_idx);
+ close(fd);
+
+ if (pfi->event_id)
+ destroy_event_hash(pfi->event_id);
+
+ threads = perf_evsel__threads(&pfi->evsel->core);
+ if (threads == NULL)
+ return;
+
+ fd = get_pinned_fd("idx_hash");
+ nr = perf_thread_map__nr(threads);
+ for (int i = 0; i < nr; i++) {
+ /* The target task might be dead already, just try the pid */
+ struct idx_hash_key key = {
+ .evt_id = pfi->event_id,
+ .tgid = perf_thread_map__pid(threads, i),
+ };
+
+ bpf_map_delete_elem(fd, &key);
+ }
+ close(fd);
+}
+
+/* Maintain a hashmap from (tgid, event-id) to filter index */
+static int create_idx_hash(struct evsel *evsel, struct perf_bpf_filter_entry *entry)
{
int filter_idx;
int fd, nr, last;
+ u64 event_id = 0;
+ struct pinned_filter_idx *pfi = NULL;
struct perf_thread_map *threads;
fd = get_pinned_fd("filters");
if (fd < 0) {
- pr_debug("cannot get fd for 'filters' map\n");
+ pr_err("cannot get fd for 'filters' map\n");
return fd;
}
/* Find the first available entry in the filters map */
for (filter_idx = 0; filter_idx < MAX_FILTERS; filter_idx++) {
- if (bpf_map_update_elem(fd, &filter_idx, entry, BPF_NOEXIST) == 0) {
- pinned_filter_idx = filter_idx;
+ if (bpf_map_update_elem(fd, &filter_idx, entry, BPF_NOEXIST) == 0)
break;
- }
}
close(fd);
@@ -201,22 +371,44 @@ static int update_pid_hash(struct evsel *evsel, struct perf_bpf_filter_entry *en
return -EBUSY;
}
+ pfi = zalloc(sizeof(*pfi));
+ if (pfi == NULL) {
+ pr_err("Cannot save pinned filter index\n");
+ goto err;
+ }
+
+ pfi->evsel = evsel;
+ pfi->hash_idx = filter_idx;
+
+ event_id = create_event_hash(evsel);
+ if (event_id == 0) {
+ pr_err("Cannot update the event hash\n");
+ goto err;
+ }
+
+ pfi->event_id = event_id;
+
threads = perf_evsel__threads(&evsel->core);
if (threads == NULL) {
pr_err("Cannot get the thread list of the event\n");
- return -EINVAL;
+ goto err;
}
/* save the index to a hash map */
- fd = get_pinned_fd("pid_hash");
- if (fd < 0)
- return fd;
+ fd = get_pinned_fd("idx_hash");
+ if (fd < 0) {
+ pr_err("cannot get fd for 'idx_hash' map\n");
+ goto err;
+ }
last = -1;
nr = perf_thread_map__nr(threads);
for (int i = 0; i < nr; i++) {
int pid = perf_thread_map__pid(threads, i);
int tgid;
+ struct idx_hash_key key = {
+ .evt_id = event_id,
+ };
/* it actually needs tgid, let's get tgid from /proc. */
tgid = convert_to_tgid(pid);
@@ -228,16 +420,25 @@ static int update_pid_hash(struct evsel *evsel, struct perf_bpf_filter_entry *en
if (tgid == last)
continue;
last = tgid;
+ key.tgid = tgid;
- if (bpf_map_update_elem(fd, &tgid, &filter_idx, BPF_ANY) < 0) {
- pr_err("Failed to update the pid hash\n");
+ if (bpf_map_update_elem(fd, &key, &filter_idx, BPF_ANY) < 0) {
+ pr_err("Failed to update the idx_hash\n");
close(fd);
- return -1;
+ goto err;
}
- pr_debug("pid hash: %d -> %d\n", tgid, filter_idx);
+ pr_debug("bpf-filter: idx_hash (task=%d,%s) -> %d\n",
+ tgid, evsel__name(evsel), filter_idx);
}
+
+ list_add(&pfi->list, &pinned_filters);
close(fd);
- return 0;
+ return filter_idx;
+
+err:
+ if (pfi != NULL)
+ destroy_idx_hash(pfi);
+ free(pfi);
+ return -1;
}
int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target)
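One detail worth noting in create_idx_hash() above: the BPF_NOEXIST loop doubles as a lock-free slot allocator. The update fails with EEXIST when the key is already present, so the first successful update atomically claims a free index, and concurrent non-root users can never claim the same slot. Condensed:

    /* Condensed form of the slot allocation in create_idx_hash():
     * BPF_NOEXIST makes the update fail if the key is already taken,
     * so a successful update atomically claims the index. */
    for (idx = 0; idx < MAX_FILTERS; idx++) {
            if (bpf_map_update_elem(fd, &idx, entry, BPF_NOEXIST) == 0)
                    break;  /* slot claimed */
    }
    if (idx == MAX_FILTERS)
            return -EBUSY;  /* all slots in use */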
@@ -247,7 +448,7 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target)
struct bpf_program *prog;
struct bpf_link *link;
struct perf_bpf_filter_entry *entry;
- bool needs_pid_hash = !target__has_cpu(target) && !target->uid_str;
+ bool needs_idx_hash = !target__has_cpu(target) && !target->uid_str;
entry = calloc(MAX_FILTERS, sizeof(*entry));
if (entry == NULL)
@@ -259,11 +460,11 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target)
goto err;
}
- if (needs_pid_hash && geteuid() != 0) {
+ if (needs_idx_hash && geteuid() != 0) {
int zero = 0;
/* The filters map is shared among other processes */
- ret = update_pid_hash(evsel, entry);
+ ret = create_idx_hash(evsel, entry);
if (ret < 0)
goto err;
@@ -274,7 +475,7 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target)
}
/* Reset the lost count */
- bpf_map_update_elem(fd, &pinned_filter_idx, &zero, BPF_ANY);
+ bpf_map_update_elem(fd, &ret, &zero, BPF_ANY);
close(fd);
fd = get_pinned_fd("perf_sample_filter");
@@ -288,6 +489,7 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target)
ret = ioctl(FD(evsel, x, y), PERF_EVENT_IOC_SET_BPF, fd);
if (ret < 0) {
pr_err("Failed to attach perf sample-filter\n");
+ close(fd);
goto err;
}
}
@@ -332,6 +534,15 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target)
err:
free(entry);
+ if (!list_empty(&pinned_filters)) {
+ struct pinned_filter_idx *pfi, *tmp;
+
+ list_for_each_entry_safe(pfi, tmp, &pinned_filters, list) {
+ destroy_idx_hash(pfi);
+ list_del(&pfi->list);
+ free(pfi);
+ }
+ }
sample_filter_bpf__destroy(skel);
return ret;
}
@@ -339,6 +550,7 @@ err:
int perf_bpf_filter__destroy(struct evsel *evsel)
{
struct perf_bpf_filter_expr *expr, *tmp;
+ struct pinned_filter_idx *pfi, *pos;
list_for_each_entry_safe(expr, tmp, &evsel->bpf_filters, list) {
list_del(&expr->list);
@@ -346,14 +558,11 @@ int perf_bpf_filter__destroy(struct evsel *evsel)
}
sample_filter_bpf__destroy(evsel->bpf_skel);
- if (pinned_filter_idx >= 0) {
- int fd = get_pinned_fd("filters");
-
- bpf_map_delete_elem(fd, &pinned_filter_idx);
- pinned_filter_idx = -1;
- close(fd);
+ list_for_each_entry_safe(pfi, pos, &pinned_filters, list) {
+ destroy_idx_hash(pfi);
+ list_del(&pfi->list);
+ free(pfi);
}
-
return 0;
}
@@ -364,10 +573,20 @@ u64 perf_bpf_filter__lost_count(struct evsel *evsel)
if (list_empty(&evsel->bpf_filters))
return 0;
- if (pinned_filter_idx >= 0) {
+ if (!list_empty(&pinned_filters)) {
int fd = get_pinned_fd("dropped");
+ struct pinned_filter_idx *pfi;
+
+ if (fd < 0)
+ return 0;
- bpf_map_lookup_elem(fd, &pinned_filter_idx, &count);
+ list_for_each_entry(pfi, &pinned_filters, list) {
+ if (pfi->evsel != evsel)
+ continue;
+
+ bpf_map_lookup_elem(fd, &pfi->hash_idx, &count);
+ break;
+ }
close(fd);
} else if (evsel->bpf_skel) {
struct sample_filter_bpf *skel = evsel->bpf_skel;
@@ -429,9 +648,10 @@ int perf_bpf_filter__pin(void)
/* pinned program will use the event/idx hash maps */
bpf_map__set_max_entries(skel->maps.filters, MAX_FILTERS);
- bpf_map__set_max_entries(skel->maps.pid_hash, MAX_PIDS);
+ bpf_map__set_max_entries(skel->maps.event_hash, MAX_EVT_HASH);
+ bpf_map__set_max_entries(skel->maps.idx_hash, MAX_IDX_HASH);
bpf_map__set_max_entries(skel->maps.dropped, MAX_FILTERS);
- skel->rodata->use_pid_hash = 1;
+ skel->rodata->use_idx_hash = 1;
if (sample_filter_bpf__load(skel) < 0) {
ret = -errno;
@@ -484,8 +704,12 @@ int perf_bpf_filter__pin(void)
pr_debug("chmod for filters failed\n");
ret = -errno;
}
- if (fchmodat(dir_fd, "pid_hash", 0666, 0) < 0) {
- pr_debug("chmod for pid_hash failed\n");
+ if (fchmodat(dir_fd, "event_hash", 0666, 0) < 0) {
+ pr_debug("chmod for event_hash failed\n");
+ ret = -errno;
+ }
+ if (fchmodat(dir_fd, "idx_hash", 0666, 0) < 0) {
+ pr_debug("chmod for idx_hash failed\n");
ret = -errno;
}
if (fchmodat(dir_fd, "dropped", 0666, 0) < 0) {
diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h
index e666bfd5fbdd..5f0c8e4e83d3 100644
--- a/tools/perf/util/bpf_skel/sample-filter.h
+++ b/tools/perf/util/bpf_skel/sample-filter.h
@@ -1,8 +1,9 @@
#ifndef PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H
#define PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H
-#define MAX_FILTERS 64
-#define MAX_PIDS (16 * 1024)
+#define MAX_FILTERS 64
+#define MAX_IDX_HASH (16 * 1024)
+#define MAX_EVT_HASH (1024 * 1024)
/* supported filter operations */
enum perf_bpf_filter_op {
@@ -62,4 +63,10 @@ struct perf_bpf_filter_entry {
__u64 value;
};
+struct idx_hash_key {
+ __u64 evt_id;
+ __u32 tgid;
+ __u32 reserved;
+};
+
#endif /* PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H */
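Since hash map keys are compared as raw bytes, the explicit 'reserved' field keeps struct idx_hash_key free of unnamed padding, and both sides must zero the unused bytes (the designated initializers in the patch do this implicitly). As a usage illustration, a hedged userspace sketch that reads a filter index back from the pinned map, assuming <bpf/bpf.h> and <unistd.h>; the helper name and pin path are hypothetical:

    /* Hypothetical sketch: look up this process's filter index for a
     * representative event id via the pinned 'idx_hash' map. */
    static int lookup_filter_idx(__u64 event_id)
    {
            struct idx_hash_key key = {
                    .evt_id = event_id,
                    .tgid = getpid(),  /* tgid == pid of the group leader */
            };
            int fd, idx = -1;

            fd = bpf_obj_get("/sys/fs/bpf/perf_filter/idx_hash");  /* assumed path */
            if (fd < 0)
                    return -1;

            if (bpf_map_lookup_elem(fd, &key, &idx) < 0)
                    idx = -1;

            close(fd);
            return idx;
    }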
diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c
index 4c75354b84fd..4872a16eedfd 100644
--- a/tools/perf/util/bpf_skel/sample_filter.bpf.c
+++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c
@@ -15,13 +15,25 @@ struct filters {
__uint(max_entries, 1);
} filters SEC(".maps");
-/* tgid to filter index */
-struct pid_hash {
+/*
+ * An evsel has multiple instances for each CPU or task but we need a single
+ * id to be used as a key for the idx_hash. This hashmap would translate the
+ * instance's ID to a representative ID.
+ */
+struct event_hash {
__uint(type, BPF_MAP_TYPE_HASH);
- __type(key, int);
+ __type(key, __u64);
+ __type(value, __u64);
+ __uint(max_entries, 1);
+} event_hash SEC(".maps");
+
+/* tgid/evtid to filter index */
+struct idx_hash {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, struct idx_hash_key);
__type(value, int);
__uint(max_entries, 1);
-} pid_hash SEC(".maps");
+} idx_hash SEC(".maps");
/* filter index to count of dropped samples */
struct lost_count {
@@ -31,7 +43,7 @@ struct lost_count {
__uint(max_entries, 1);
} dropped SEC(".maps");
-volatile const int use_pid_hash;
+volatile const int use_idx_hash;
void *bpf_cast_to_kern_ctx(void *) __ksym;
@@ -202,11 +214,25 @@ int perf_sample_filter(void *ctx)
k = 0;
- if (use_pid_hash) {
- int tgid = bpf_get_current_pid_tgid() >> 32;
+ if (use_idx_hash) {
+ struct idx_hash_key key = {
+ .tgid = bpf_get_current_pid_tgid() >> 32,
+ };
+ __u64 eid = kctx->event->id;
+ __u64 *key_id;
int *idx;
- idx = bpf_map_lookup_elem(&pid_hash, &tgid);
+ /* get primary_event_id */
+ if (kctx->event->parent)
+ eid = kctx->event->parent->id;
+
+ key_id = bpf_map_lookup_elem(&event_hash, &eid);
+ if (key_id == NULL)
+ goto drop;
+
+ key.evt_id = *key_id;
+
+ idx = bpf_map_lookup_elem(&idx_hash, &key);
if (idx)
k = *idx;
else
diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
index d818e30c5457..4fa21468487e 100644
--- a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
+++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
@@ -175,6 +175,11 @@ struct perf_sample_data {
u64 code_page_size;
} __attribute__((__aligned__(64))) __attribute__((preserve_access_index));
+struct perf_event {
+ struct perf_event *parent;
+ u64 id;
+} __attribute__((preserve_access_index));
+
struct bpf_perf_event_data_kern {
struct perf_sample_data *data;
struct perf_event *event;
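The minimal struct perf_event above works because __attribute__((preserve_access_index)) makes the field accesses relocatable: libbpf fixes up the offsets against the running kernel's BTF at load time (BPF CO-RE), so only the fields actually used need to be declared. For illustration, an equivalent explicit form of the parent-id lookup using BPF_CORE_READ (a sketch, not code from this patch):

    #include <bpf/bpf_core_read.h>

    /* Sketch: fetch the representative (primary) event id with explicit
     * CO-RE reads, preferring the parent for inherited events. */
    static __u64 get_primary_event_id(struct perf_event *event)
    {
            struct perf_event *parent = BPF_CORE_READ(event, parent);

            if (parent)
                    return BPF_CORE_READ(parent, id);
            return BPF_CORE_READ(event, id);
    }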