Diffstat (limited to 'kernel/events/core.c')
-rw-r--r-- | kernel/events/core.c | 226 |
1 files changed, 186 insertions, 40 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a54f2c2cdb20..6ee1febdf6ff 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -902,6 +902,17 @@ list_update_cgroup_event(struct perf_event *event,
 	 * this will always be called from the right CPU.
 	 */
 	cpuctx = __get_cpu_context(ctx);
+
+	/* Only set/clear cpuctx->cgrp if current task uses event->cgrp. */
+	if (perf_cgroup_from_task(current, ctx) != event->cgrp) {
+		/*
+		 * We are removing the last cpu event in this context.
+		 * If that event is not active in this cpu, cpuctx->cgrp
+		 * should've been cleared by perf_cgroup_switch.
+		 */
+		WARN_ON_ONCE(!add && cpuctx->cgrp);
+		return;
+	}
 	cpuctx->cgrp = add ? event->cgrp : NULL;
 }

@@ -1475,8 +1486,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (event->group_leader == event) {
 		struct list_head *list;

-		if (is_software_event(event))
-			event->group_flags |= PERF_GROUP_SOFTWARE;
+		event->group_caps = event->event_caps;

 		list = ctx_group_list(event, ctx);
 		list_add_tail(&event->group_entry, list);
@@ -1630,9 +1640,7 @@ static void perf_group_attach(struct perf_event *event)

 	WARN_ON_ONCE(group_leader->ctx != event->ctx);

-	if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
-			!is_software_event(event))
-		group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
+	group_leader->group_caps &= event->event_caps;

 	list_add_tail(&event->group_entry, &group_leader->sibling_list);
 	group_leader->nr_siblings++;
@@ -1723,7 +1731,7 @@ static void perf_group_detach(struct perf_event *event)
 		sibling->group_leader = sibling;

 		/* Inherit group flags from the previous leader */
-		sibling->group_flags = event->group_flags;
+		sibling->group_caps = event->group_caps;

 		WARN_ON_ONCE(sibling->ctx != event->ctx);
 	}
@@ -1832,6 +1840,8 @@ group_sched_out(struct perf_event *group_event,
 	struct perf_event *event;
 	int state = group_event->state;

+	perf_pmu_disable(ctx->pmu);
+
 	event_sched_out(group_event, cpuctx, ctx);

 	/*
@@ -1840,6 +1850,8 @@ group_sched_out(struct perf_event *group_event,
 	list_for_each_entry(event, &group_event->sibling_list, group_entry)
 		event_sched_out(event, cpuctx, ctx);

+	perf_pmu_enable(ctx->pmu);
+
 	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
 		cpuctx->exclusive = 0;
 }
@@ -1959,6 +1971,12 @@ void perf_event_disable(struct perf_event *event)
 }
 EXPORT_SYMBOL_GPL(perf_event_disable);

+void perf_event_disable_inatomic(struct perf_event *event)
+{
+	event->pending_disable = 1;
+	irq_work_queue(&event->pending);
+}
+
 static void perf_set_shadow_time(struct perf_event *event,
 				 struct perf_event_context *ctx,
 				 u64 tstamp)
@@ -2145,7 +2163,7 @@ static int group_can_go_on(struct perf_event *event,
 	/*
 	 * Groups consisting entirely of software events can always go on.
 	 */
-	if (event->group_flags & PERF_GROUP_SOFTWARE)
+	if (event->group_caps & PERF_EV_CAP_SOFTWARE)
 		return 1;
 	/*
 	 * If an exclusive group is already on, no other hardware
@@ -2491,7 +2509,7 @@ static int __perf_event_stop(void *info)
 	 * while restarting.
 	 */
 	if (sd->restart)
-		event->pmu->start(event, PERF_EF_START);
+		event->pmu->start(event, 0);

 	return 0;
 }
@@ -2837,19 +2855,36 @@ unlock:
 	}
 }

+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
 void perf_sched_cb_dec(struct pmu *pmu)
 {
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
 	this_cpu_dec(perf_sched_cb_usages);
+
+	if (!--cpuctx->sched_cb_usage)
+		list_del(&cpuctx->sched_cb_entry);
 }

+
 void perf_sched_cb_inc(struct pmu *pmu)
 {
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+	if (!cpuctx->sched_cb_usage++)
+		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
 	this_cpu_inc(perf_sched_cb_usages);
 }

 /*
  * This function provides the context switch callback to the lower code
  * layer. It is invoked ONLY when the context switch callback is enabled.
+ *
+ * This callback is relevant even to per-cpu events; for example multi event
+ * PEBS requires this to provide PID/TID information. This requires we flush
+ * all queued PEBS records before we context switch to a new task.
  */
 static void perf_pmu_sched_task(struct task_struct *prev,
 				struct task_struct *next,
@@ -2857,34 +2892,24 @@ static void perf_pmu_sched_task(struct task_struct *prev,
 {
 	struct perf_cpu_context *cpuctx;
 	struct pmu *pmu;
-	unsigned long flags;

 	if (prev == next)
 		return;

-	local_irq_save(flags);
-
-	rcu_read_lock();
+	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+		pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */

-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		if (pmu->sched_task) {
-			cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-
-			perf_pmu_disable(pmu);
+		if (WARN_ON_ONCE(!pmu->sched_task))
+			continue;

-			pmu->sched_task(cpuctx->task_ctx, sched_in);
+		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+		perf_pmu_disable(pmu);

-			perf_pmu_enable(pmu);
+		pmu->sched_task(cpuctx->task_ctx, sched_in);

-			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-		}
+		perf_pmu_enable(pmu);
+		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 	}
-
-	rcu_read_unlock();
-
-	local_irq_restore(flags);
 }

 static void perf_event_switch(struct task_struct *task,
@@ -3416,6 +3441,22 @@ struct perf_read_data {
 	int ret;
 };

+static int find_cpu_to_read(struct perf_event *event, int local_cpu)
+{
+	int event_cpu = event->oncpu;
+	u16 local_pkg, event_pkg;
+
+	if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
+		event_pkg = topology_physical_package_id(event_cpu);
+		local_pkg = topology_physical_package_id(local_cpu);
+
+		if (event_pkg == local_pkg)
+			return local_cpu;
+	}
+
+	return event_cpu;
+}
+
 /*
  * Cross CPU call to read the hardware event
  */
@@ -3537,7 +3578,7 @@ u64 perf_event_read_local(struct perf_event *event)

 static int perf_event_read(struct perf_event *event, bool group)
 {
-	int ret = 0;
+	int ret = 0, cpu_to_read, local_cpu;

 	/*
 	 * If event is enabled and currently active on a CPU, update the
@@ -3549,6 +3590,11 @@ static int perf_event_read(struct perf_event *event, bool group)
 			.group = group,
 			.ret = 0,
 		};
+
+		local_cpu = get_cpu();
+		cpu_to_read = find_cpu_to_read(event, local_cpu);
+		put_cpu();
+
 		/*
 		 * Purposely ignore the smp_call_function_single() return
 		 * value.
@@ -3559,7 +3605,7 @@ static int perf_event_read(struct perf_event *event, bool group)
 		 * Therefore, either way, we'll have an up-to-date event count
 		 * after this.
 		 */
-		(void)smp_call_function_single(event->oncpu, __perf_event_read, &data, 1);
+		(void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1);
 		ret = data.ret;
 	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
 		struct perf_event_context *ctx = event->ctx;
@@ -3929,7 +3975,7 @@ static void exclusive_event_destroy(struct perf_event *event)

 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
 {
-	if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
+	if ((e1->pmu == e2->pmu) &&
 	    (e1->cpu == e2->cpu ||
 	     e1->cpu == -1 ||
 	     e2->cpu == -1))
@@ -5350,9 +5396,10 @@ perf_output_sample_regs(struct perf_output_handle *handle,
 			struct pt_regs *regs, u64 mask)
 {
 	int bit;
+	DECLARE_BITMAP(_mask, 64);

-	for_each_set_bit(bit, (const unsigned long *) &mask,
-			 sizeof(mask) * BITS_PER_BYTE) {
+	bitmap_from_u64(_mask, mask);
+	for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
 		u64 val;

 		val = perf_reg_value(regs, bit);
@@ -7045,11 +7092,11 @@ static int __perf_event_overflow(struct perf_event *event,
 	if (events && atomic_dec_and_test(&event->event_limit)) {
 		ret = 1;
 		event->pending_kill = POLL_HUP;
-		event->pending_disable = 1;
-		irq_work_queue(&event->pending);
+
+		perf_event_disable_inatomic(event);
 	}

-	event->overflow_handler(event, data, regs);
+	READ_ONCE(event->overflow_handler)(event, data, regs);

 	if (*perf_event_fasync(event) && event->pending_kill) {
 		event->pending_wakeup = 1;
@@ -7664,11 +7711,83 @@ static void perf_event_free_filter(struct perf_event *event)
 	ftrace_profile_free_filter(event);
 }

+#ifdef CONFIG_BPF_SYSCALL
+static void bpf_overflow_handler(struct perf_event *event,
+				 struct perf_sample_data *data,
+				 struct pt_regs *regs)
+{
+	struct bpf_perf_event_data_kern ctx = {
+		.data = data,
+		.regs = regs,
+	};
+	int ret = 0;
+
+	preempt_disable();
+	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
+		goto out;
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(event->prog, (void *)&ctx);
+	rcu_read_unlock();
+out:
+	__this_cpu_dec(bpf_prog_active);
+	preempt_enable();
+	if (!ret)
+		return;
+
+	event->orig_overflow_handler(event, data, regs);
+}
+
+static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+{
+	struct bpf_prog *prog;
+
+	if (event->overflow_handler_context)
+		/* hw breakpoint or kernel counter */
+		return -EINVAL;
+
+	if (event->prog)
+		return -EEXIST;
+
+	prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	event->prog = prog;
+	event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
+	WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
+	return 0;
+}
+
+static void perf_event_free_bpf_handler(struct perf_event *event)
+{
+	struct bpf_prog *prog = event->prog;
+
+	if (!prog)
+		return;
+
+	WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
+	event->prog = NULL;
+	bpf_prog_put(prog);
+}
+#else
+static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+{
+	return -EOPNOTSUPP;
+}
+static void perf_event_free_bpf_handler(struct perf_event *event)
+{
+}
+#endif
+
 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 {
 	bool is_kprobe, is_tracepoint;
 	struct bpf_prog *prog;

+	if (event->attr.type == PERF_TYPE_HARDWARE ||
+	    event->attr.type == PERF_TYPE_SOFTWARE)
+		return perf_event_set_bpf_handler(event, prog_fd);
+
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 		return -EINVAL;

@@ -7709,6 +7828,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
 {
 	struct bpf_prog *prog;

+	perf_event_free_bpf_handler(event);
+
 	if (!event->tp_event)
 		return;

@@ -7908,6 +8029,7 @@ restart:
  * if <size> is not specified, the range is treated as a single address.
  */
 enum {
+	IF_ACT_NONE = -1,
 	IF_ACT_FILTER,
 	IF_ACT_START,
 	IF_ACT_STOP,
@@ -7931,6 +8053,7 @@ static const match_table_t if_tokens = {
 	{ IF_SRC_KERNEL,	"%u/%u" },
 	{ IF_SRC_FILEADDR,	"%u@%s" },
 	{ IF_SRC_KERNELADDR,	"%u" },
+	{ IF_ACT_NONE,		NULL },
 };

 /*
@@ -8751,7 +8874,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register);

 void perf_pmu_unregister(struct pmu *pmu)
 {
+	int remove_device;
+
 	mutex_lock(&pmus_lock);
+	remove_device = pmu_bus_running;
 	list_del_rcu(&pmu->entry);
 	mutex_unlock(&pmus_lock);

@@ -8765,10 +8891,12 @@ void perf_pmu_unregister(struct pmu *pmu)
 	free_percpu(pmu->pmu_disable_count);
 	if (pmu->type >= PERF_TYPE_MAX)
 		idr_remove(&pmu_idr, pmu->type);
-	if (pmu->nr_addr_filters)
-		device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
-	device_del(pmu->dev);
-	put_device(pmu->dev);
+	if (remove_device) {
+		if (pmu->nr_addr_filters)
+			device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
+		device_del(pmu->dev);
+		put_device(pmu->dev);
+	}
 	free_pmu_context(pmu);
 }
 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -9025,6 +9153,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (!overflow_handler && parent_event) {
 		overflow_handler = parent_event->overflow_handler;
 		context = parent_event->overflow_handler_context;
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
+		if (overflow_handler == bpf_overflow_handler) {
+			struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
+
+			if (IS_ERR(prog)) {
+				err = PTR_ERR(prog);
+				goto err_ns;
+			}
+			event->prog = prog;
+			event->orig_overflow_handler =
+				parent_event->orig_overflow_handler;
+		}
+#endif
 	}

 	if (overflow_handler) {
@@ -9505,6 +9646,9 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_alloc;
 	}

+	if (pmu->task_ctx_nr == perf_sw_context)
+		event->event_caps |= PERF_EV_CAP_SOFTWARE;
+
 	if (group_leader &&
 	    (is_software_event(event) != is_software_event(group_leader))) {
 		if (is_software_event(event)) {
@@ -9518,7 +9662,7 @@ SYSCALL_DEFINE5(perf_event_open,
 			 */
 			pmu = group_leader->pmu;
 		} else if (is_software_event(group_leader) &&
-			   (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
+			   (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
 			/*
 			 * In case the group is a pure software group, and we
 			 * try to add a hardware event, move the whole group to
@@ -10453,6 +10597,8 @@ static void __init perf_event_init_all_cpus(void)

 		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
 		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
+
+		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
 	}
 }
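
For readers skimming the group_flags to group_caps conversion above: a group leader now seeds group_caps from its own event_caps in list_add_event(), each sibling AND-s its event_caps into the leader in perf_group_attach(), and group_can_go_on() only takes the software fast path when PERF_EV_CAP_SOFTWARE survived that intersection. The stand-alone sketch below is illustrative only (the fake_* names are hypothetical, not kernel code); it just models that bookkeeping.

```c
/* Illustrative model of the group_caps intersection introduced by the patch. */
#include <stdio.h>

#define PERF_EV_CAP_SOFTWARE		(1 << 0)
#define PERF_EV_CAP_READ_ACTIVE_PKG	(1 << 1)

struct fake_event {
	unsigned int event_caps;	/* capabilities of this event alone */
	unsigned int group_caps;	/* intersection over the whole group */
};

/* list_add_event(): a new leader seeds group_caps from its own caps. */
static void fake_list_add_event(struct fake_event *leader)
{
	leader->group_caps = leader->event_caps;
}

/* perf_group_attach(): every sibling narrows the leader's group_caps. */
static void fake_group_attach(struct fake_event *leader, struct fake_event *sibling)
{
	leader->group_caps &= sibling->event_caps;
}

/* group_can_go_on() fast path: pure software groups can always schedule. */
static int fake_group_is_software(const struct fake_event *leader)
{
	return !!(leader->group_caps & PERF_EV_CAP_SOFTWARE);
}

int main(void)
{
	struct fake_event leader = { .event_caps = PERF_EV_CAP_SOFTWARE };
	struct fake_event hw_sibling = { .event_caps = 0 };	/* hardware event */

	fake_list_add_event(&leader);
	printf("software-only group: %d\n", fake_group_is_software(&leader));	/* 1 */

	fake_group_attach(&leader, &hw_sibling);
	printf("after adding hw sibling: %d\n", fake_group_is_software(&leader));	/* 0 */
	return 0;
}
```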
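The new perf_event_set_bpf_handler() path means a BPF_PROG_TYPE_PERF_EVENT program can now be attached to a PERF_TYPE_HARDWARE or PERF_TYPE_SOFTWARE sampling event through the existing PERF_EVENT_IOC_SET_BPF ioctl. The user-space sketch below is not from the patch: it is a rough illustration with error handling omitted, using raw syscalls and a trivial two-instruction program that returns 1 so the original overflow handler still runs, and it assumes a kernel carrying this change plus sufficient perf/BPF privileges.

```c
/* Illustrative sketch: attach a trivial BPF program to a hardware sampling event. */
#include <linux/bpf.h>
#include <linux/perf_event.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* "r0 = 1; exit" -- a non-zero return lets the original overflow handler run. */
	struct bpf_insn insns[] = {
		{ .code = 0xb7, .dst_reg = 0, .imm = 1 },	/* BPF_MOV64_IMM(R0, 1) */
		{ .code = 0x95 },				/* BPF_EXIT_INSN() */
	};
	union bpf_attr battr;
	struct perf_event_attr pattr;
	int bpf_fd, perf_fd;

	memset(&battr, 0, sizeof(battr));
	battr.prog_type = BPF_PROG_TYPE_PERF_EVENT;
	battr.insns = (uint64_t)(unsigned long)insns;
	battr.insn_cnt = 2;
	battr.license = (uint64_t)(unsigned long)"GPL";
	bpf_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &battr, sizeof(battr));

	memset(&pattr, 0, sizeof(pattr));
	pattr.size = sizeof(pattr);
	pattr.type = PERF_TYPE_HARDWARE;	/* now accepted by PERF_EVENT_IOC_SET_BPF */
	pattr.config = PERF_COUNT_HW_CPU_CYCLES;
	pattr.sample_period = 1000000;
	pattr.disabled = 1;
	perf_fd = syscall(__NR_perf_event_open, &pattr, 0, -1, -1, 0);

	/* The program now runs from bpf_overflow_handler() on every counter overflow. */
	ioctl(perf_fd, PERF_EVENT_IOC_SET_BPF, bpf_fd);
	ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0);

	sleep(1);	/* ... profile something ... */
	return 0;
}
```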