summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-11-19 13:34:06 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2024-11-19 13:34:06 -0800
commitf41dac3efb7582cd3f518fadf7764d424f453788 (patch)
treeda2aee8b82625525adf671911ed862af43a1b60d
parent9d7d4ad222aea8ab482e78858d03b10221c7fe78 (diff)
parent2c47e7a74f445426d156278e339b7abb259e50de (diff)
Merge tag 'perf-core-2024-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull performance events updates from Ingo Molnar: "Uprobes: - Add BPF session support (Jiri Olsa) - Switch to RCU Tasks Trace flavor for better performance (Andrii Nakryiko) - Massively increase uretprobe SMP scalability by SRCU-protecting the uretprobe lifetime (Andrii Nakryiko) - Kill xol_area->slot_count (Oleg Nesterov) Core facilities: - Implement targeted high-frequency profiling by adding the ability for an event to "pause" or "resume" AUX area tracing (Adrian Hunter) VM profiling/sampling: - Correct perf sampling with guest VMs (Colton Lewis) New hardware support: - x86/intel: Add PMU support for Intel ArrowLake-H CPUs (Dapeng Mi) Misc fixes and enhancements: - x86/intel/pt: Fix buffer full but size is 0 case (Adrian Hunter) - x86/amd: Warn only on new bits set (Breno Leitao) - x86/amd/uncore: Avoid a false positive warning about snprintf truncation in amd_uncore_umc_ctx_init (Jean Delvare) - uprobes: Re-order struct uprobe_task to save some space (Christophe JAILLET) - x86/rapl: Move the pmu allocation out of CPU hotplug (Kan Liang) - x86/rapl: Clean up cpumask and hotplug (Kan Liang) - uprobes: Deuglify xol_get_insn_slot/xol_free_insn_slot paths (Oleg Nesterov)" * tag 'perf-core-2024-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits) perf/core: Correct perf sampling with guest VMs perf/x86: Refactor misc flag assignments perf/powerpc: Use perf_arch_instruction_pointer() perf/core: Hoist perf_instruction_pointer() and perf_misc_flags() perf/arm: Drop unused functions uprobes: Re-order struct uprobe_task to save some space perf/x86/amd/uncore: Avoid a false positive warning about snprintf truncation in amd_uncore_umc_ctx_init perf/x86/intel: Do not enable large PEBS for events with aux actions or aux sampling perf/x86/intel/pt: Add support for pause / resume perf/core: Add aux_pause, aux_resume, aux_start_paused perf/x86/intel/pt: Fix buffer full but size is 0 case uprobes: SRCU-protect uretprobe lifetime (with timeout) uprobes: allow put_uprobe() from non-sleepable softirq context perf/x86/rapl: Clean up cpumask and hotplug perf/x86/rapl: Move the pmu allocation out of CPU hotplug uprobe: Add support for session consumer uprobe: Add data pointer to consumer handlers perf/x86/amd: Warn only on new bits set uprobes: fold xol_take_insn_slot() into xol_get_insn_slot() uprobes: kill xol_area->slot_count ...
-rw-r--r--arch/Kconfig1
-rw-r--r--arch/arm/include/asm/perf_event.h7
-rw-r--r--arch/arm/kernel/perf_callchain.c17
-rw-r--r--arch/arm64/include/asm/perf_event.h4
-rw-r--r--arch/arm64/kernel/perf_callchain.c28
-rw-r--r--arch/powerpc/include/asm/perf_event_server.h6
-rw-r--r--arch/powerpc/perf/callchain.c2
-rw-r--r--arch/powerpc/perf/callchain_32.c2
-rw-r--r--arch/powerpc/perf/callchain_64.c2
-rw-r--r--arch/powerpc/perf/core-book3s.c4
-rw-r--r--arch/s390/include/asm/perf_event.h6
-rw-r--r--arch/s390/kernel/perf_event.c4
-rw-r--r--arch/x86/events/amd/core.c10
-rw-r--r--arch/x86/events/amd/uncore.c5
-rw-r--r--arch/x86/events/core.c64
-rw-r--r--arch/x86/events/intel/core.c137
-rw-r--r--arch/x86/events/intel/ds.c21
-rw-r--r--arch/x86/events/intel/pt.c84
-rw-r--r--arch/x86/events/intel/pt.h6
-rw-r--r--arch/x86/events/perf_event.h34
-rw-r--r--arch/x86/events/rapl.c130
-rw-r--r--arch/x86/include/asm/cpu.h6
-rw-r--r--arch/x86/include/asm/perf_event.h12
-rw-r--r--arch/x86/kernel/cpu/intel.c15
-rw-r--r--include/linux/cpuhotplug.h1
-rw-r--r--include/linux/perf_event.h54
-rw-r--r--include/linux/uprobes.h83
-rw-r--r--include/uapi/linux/perf_event.h11
-rw-r--r--kernel/events/core.c102
-rw-r--r--kernel/events/internal.h1
-rw-r--r--kernel/events/uprobes.c608
-rw-r--r--kernel/trace/bpf_trace.c6
-rw-r--r--kernel/trace/trace_uprobe.c12
-rw-r--r--tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c2
34 files changed, 1069 insertions, 418 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index bd9f095d69fa..b4ad3a6d29f1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -135,6 +135,7 @@ config KPROBES_ON_FTRACE
config UPROBES
def_bool n
depends on ARCH_SUPPORTS_UPROBES
+ select TASKS_TRACE_RCU
help
Uprobes is the user-space counterpart to kprobes: they
enable instrumentation applications (such as 'perf probe')
diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h
index bdbc1e590891..c08f16f2e243 100644
--- a/arch/arm/include/asm/perf_event.h
+++ b/arch/arm/include/asm/perf_event.h
@@ -8,13 +8,6 @@
#ifndef __ARM_PERF_EVENT_H__
#define __ARM_PERF_EVENT_H__
-#ifdef CONFIG_PERF_EVENTS
-struct pt_regs;
-extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
-extern unsigned long perf_misc_flags(struct pt_regs *regs);
-#define perf_misc_flags(regs) perf_misc_flags(regs)
-#endif
-
#define perf_arch_fetch_caller_regs(regs, __ip) { \
(regs)->ARM_pc = (__ip); \
frame_pointer((regs)) = (unsigned long) __builtin_frame_address(0); \
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 1d230ac9d0eb..a2601b1ef318 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -96,20 +96,3 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
arm_get_current_stackframe(regs, &fr);
walk_stackframe(&fr, callchain_trace, entry);
}
-
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
-{
- return instruction_pointer(regs);
-}
-
-unsigned long perf_misc_flags(struct pt_regs *regs)
-{
- int misc = 0;
-
- if (user_mode(regs))
- misc |= PERF_RECORD_MISC_USER;
- else
- misc |= PERF_RECORD_MISC_KERNEL;
-
- return misc;
-}
diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index eb7071c9eb34..ee45b4e77347 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -10,10 +10,6 @@
#include <asm/ptrace.h>
#ifdef CONFIG_PERF_EVENTS
-struct pt_regs;
-extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
-extern unsigned long perf_misc_flags(struct pt_regs *regs);
-#define perf_misc_flags(regs) perf_misc_flags(regs)
#define perf_arch_bpf_user_pt_regs(regs) &regs->user_regs
#endif
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index e8ed5673f481..9b7f26b128b5 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -38,31 +38,3 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
arch_stack_walk(callchain_trace, entry, current, regs);
}
-
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
-{
- if (perf_guest_state())
- return perf_guest_get_ip();
-
- return instruction_pointer(regs);
-}
-
-unsigned long perf_misc_flags(struct pt_regs *regs)
-{
- unsigned int guest_state = perf_guest_state();
- int misc = 0;
-
- if (guest_state) {
- if (guest_state & PERF_GUEST_USER)
- misc |= PERF_RECORD_MISC_GUEST_USER;
- else
- misc |= PERF_RECORD_MISC_GUEST_KERNEL;
- } else {
- if (user_mode(regs))
- misc |= PERF_RECORD_MISC_USER;
- else
- misc |= PERF_RECORD_MISC_KERNEL;
- }
-
- return misc;
-}
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 5995614e9062..af0f46e2373b 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -102,8 +102,8 @@ struct power_pmu {
int __init register_power_pmu(struct power_pmu *pmu);
struct pt_regs;
-extern unsigned long perf_misc_flags(struct pt_regs *regs);
-extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_arch_misc_flags(struct pt_regs *regs);
+extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
extern unsigned long int read_bhrb(int n);
/*
@@ -111,7 +111,7 @@ extern unsigned long int read_bhrb(int n);
* if we have hardware PMU support.
*/
#ifdef CONFIG_PPC_PERF_CTRS
-#define perf_misc_flags(regs) perf_misc_flags(regs)
+#define perf_arch_misc_flags(regs) perf_arch_misc_flags(regs)
#endif
/*
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index 6b4434dd0ff3..26aa26482c9a 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -51,7 +51,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
lr = regs->link;
sp = regs->gpr[1];
- perf_callchain_store(entry, perf_instruction_pointer(regs));
+ perf_callchain_store(entry, perf_arch_instruction_pointer(regs));
if (!validate_sp(sp, current))
return;
diff --git a/arch/powerpc/perf/callchain_32.c b/arch/powerpc/perf/callchain_32.c
index ea8cfe3806dc..ddcc2d8aa64a 100644
--- a/arch/powerpc/perf/callchain_32.c
+++ b/arch/powerpc/perf/callchain_32.c
@@ -139,7 +139,7 @@ void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
long level = 0;
unsigned int __user *fp, *uregs;
- next_ip = perf_instruction_pointer(regs);
+ next_ip = perf_arch_instruction_pointer(regs);
lr = regs->link;
sp = regs->gpr[1];
perf_callchain_store(entry, next_ip);
diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c
index 488e8a21a11e..115d1c105e8a 100644
--- a/arch/powerpc/perf/callchain_64.c
+++ b/arch/powerpc/perf/callchain_64.c
@@ -74,7 +74,7 @@ void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
struct signal_frame_64 __user *sigframe;
unsigned long __user *fp, *uregs;
- next_ip = perf_instruction_pointer(regs);
+ next_ip = perf_arch_instruction_pointer(regs);
lr = regs->link;
sp = regs->gpr[1];
perf_callchain_store(entry, next_ip);
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 42867469752d..dc01aa604cc1 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2332,7 +2332,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
* Called from generic code to get the misc flags (i.e. processor mode)
* for an event_id.
*/
-unsigned long perf_misc_flags(struct pt_regs *regs)
+unsigned long perf_arch_misc_flags(struct pt_regs *regs)
{
u32 flags = perf_get_misc_flags(regs);
@@ -2346,7 +2346,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
* Called from generic code to get the instruction pointer
* for an event_id.
*/
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
+unsigned long perf_arch_instruction_pointer(struct pt_regs *regs)
{
unsigned long siar = mfspr(SPRN_SIAR);
diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h
index 29ee289108c5..e53894cedf08 100644
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -37,9 +37,9 @@ extern ssize_t cpumf_events_sysfs_show(struct device *dev,
/* Perf callbacks */
struct pt_regs;
-extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
-extern unsigned long perf_misc_flags(struct pt_regs *regs);
-#define perf_misc_flags(regs) perf_misc_flags(regs)
+extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_arch_misc_flags(struct pt_regs *regs);
+#define perf_arch_misc_flags(regs) perf_arch_misc_flags(regs)
#define perf_arch_bpf_user_pt_regs(regs) &regs->user_regs
/* Perf pt_regs extension for sample-data-entry indicators */
diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c
index 0c65eaf099f9..2b9611c4718e 100644
--- a/arch/s390/kernel/perf_event.c
+++ b/arch/s390/kernel/perf_event.c
@@ -57,7 +57,7 @@ static unsigned long instruction_pointer_guest(struct pt_regs *regs)
return sie_block(regs)->gpsw.addr;
}
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
+unsigned long perf_arch_instruction_pointer(struct pt_regs *regs)
{
return is_in_guest(regs) ? instruction_pointer_guest(regs)
: instruction_pointer(regs);
@@ -84,7 +84,7 @@ static unsigned long perf_misc_flags_sf(struct pt_regs *regs)
return flags;
}
-unsigned long perf_misc_flags(struct pt_regs *regs)
+unsigned long perf_arch_misc_flags(struct pt_regs *regs)
{
/* Check if the cpum_sf PMU has created the pt_regs structure.
* In this case, perf misc flags can be easily extracted. Otherwise,
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 920e3a640cad..b4a1a2576510 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -943,11 +943,12 @@ static int amd_pmu_v2_snapshot_branch_stack(struct perf_branch_entry *entries, u
static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ static atomic64_t status_warned = ATOMIC64_INIT(0);
+ u64 reserved, status, mask, new_bits, prev_bits;
struct perf_sample_data data;
struct hw_perf_event *hwc;
struct perf_event *event;
int handled = 0, idx;
- u64 reserved, status, mask;
bool pmu_enabled;
/*
@@ -1012,7 +1013,12 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
* the corresponding PMCs are expected to be inactive according to the
* active_mask
*/
- WARN_ON(status > 0);
+ if (status > 0) {
+ prev_bits = atomic64_fetch_or(status, &status_warned);
+ // A new bit was set for the very first time.
+ new_bits = status & ~prev_bits;
+ WARN(new_bits, "New overflows for inactive PMCs: %llx\n", new_bits);
+ }
/* Clear overflow and freeze bits */
amd_pmu_ack_global_status(~status);
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 0bfde2ea5cb8..49c26ce2b115 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -916,7 +916,8 @@ int amd_uncore_umc_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
u8 group_num_pmcs[UNCORE_GROUP_MAX] = { 0 };
union amd_uncore_info info;
struct amd_uncore_pmu *pmu;
- int index = 0, gid, i;
+ int gid, i;
+ u16 index = 0;
if (pmu_version < 2)
return 0;
@@ -948,7 +949,7 @@ int amd_uncore_umc_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
for_each_set_bit(gid, gmask, UNCORE_GROUP_MAX) {
for (i = 0; i < group_num_pmus[gid]; i++) {
pmu = &uncore->pmus[index];
- snprintf(pmu->name, sizeof(pmu->name), "amd_umc_%d", index);
+ snprintf(pmu->name, sizeof(pmu->name), "amd_umc_%hu", index);
pmu->num_counters = group_num_pmcs[gid] / group_num_pmus[gid];
pmu->msr_base = MSR_F19H_UMC_PERF_CTL + i * pmu->num_counters * 2;
pmu->rdpmc_base = -1;
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 65ab6460aed4..c75c482d4c52 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -3003,35 +3003,57 @@ static unsigned long code_segment_base(struct pt_regs *regs)
return 0;
}
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
+unsigned long perf_arch_instruction_pointer(struct pt_regs *regs)
{
- if (perf_guest_state())
- return perf_guest_get_ip();
-
return regs->ip + code_segment_base(regs);
}
-unsigned long perf_misc_flags(struct pt_regs *regs)
+static unsigned long common_misc_flags(struct pt_regs *regs)
{
- unsigned int guest_state = perf_guest_state();
- int misc = 0;
+ if (regs->flags & PERF_EFLAGS_EXACT)
+ return PERF_RECORD_MISC_EXACT_IP;
- if (guest_state) {
- if (guest_state & PERF_GUEST_USER)
- misc |= PERF_RECORD_MISC_GUEST_USER;
- else
- misc |= PERF_RECORD_MISC_GUEST_KERNEL;
- } else {
- if (user_mode(regs))
- misc |= PERF_RECORD_MISC_USER;
- else
- misc |= PERF_RECORD_MISC_KERNEL;
- }
+ return 0;
+}
- if (regs->flags & PERF_EFLAGS_EXACT)
- misc |= PERF_RECORD_MISC_EXACT_IP;
+static unsigned long guest_misc_flags(struct pt_regs *regs)
+{
+ unsigned long guest_state = perf_guest_state();
+
+ if (!(guest_state & PERF_GUEST_ACTIVE))
+ return 0;
+
+ if (guest_state & PERF_GUEST_USER)
+ return PERF_RECORD_MISC_GUEST_USER;
+ else
+ return PERF_RECORD_MISC_GUEST_KERNEL;
+
+}
+
+static unsigned long host_misc_flags(struct pt_regs *regs)
+{
+ if (user_mode(regs))
+ return PERF_RECORD_MISC_USER;
+ else
+ return PERF_RECORD_MISC_KERNEL;
+}
+
+unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs)
+{
+ unsigned long flags = common_misc_flags(regs);
+
+ flags |= guest_misc_flags(regs);
+
+ return flags;
+}
+
+unsigned long perf_arch_misc_flags(struct pt_regs *regs)
+{
+ unsigned long flags = common_misc_flags(regs);
+
+ flags |= host_misc_flags(regs);
- return misc;
+ return flags;
}
void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index d879478db3f5..bb284aff7bfd 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3962,8 +3962,8 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
- if (!(event->attr.sample_type &
- ~intel_pmu_large_pebs_flags(event))) {
+ if (!(event->attr.sample_type & ~intel_pmu_large_pebs_flags(event)) &&
+ !has_aux_action(event)) {
event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
event->attach_state |= PERF_ATTACH_SCHED_CB;
}
@@ -4599,6 +4599,28 @@ static inline bool erratum_hsw11(struct perf_event *event)
X86_CONFIG(.event=0xc0, .umask=0x01);
}
+static struct event_constraint *
+arl_h_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->pmu_type == hybrid_tiny)
+ return cmt_get_event_constraints(cpuc, idx, event);
+
+ return mtl_get_event_constraints(cpuc, idx, event);
+}
+
+static int arl_h_hw_config(struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->pmu_type == hybrid_tiny)
+ return intel_pmu_hw_config(event);
+
+ return adl_hw_config(event);
+}
+
/*
* The HSW11 requires a period larger than 100 which is the same as the BDM11.
* A minimum period of 128 is enforced as well for the INST_RETIRED.ALL.
@@ -4924,17 +4946,26 @@ static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void)
/*
* This essentially just maps between the 'hybrid_cpu_type'
- * and 'hybrid_pmu_type' enums:
+ * and 'hybrid_pmu_type' enums except for ARL-H processor
+ * which needs to compare atom uarch native id since ARL-H
+ * contains two different atom uarchs.
*/
for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
enum hybrid_pmu_type pmu_type = x86_pmu.hybrid_pmu[i].pmu_type;
+ u32 native_id;
- if (cpu_type == HYBRID_INTEL_CORE &&
- pmu_type == hybrid_big)
- return &x86_pmu.hybrid_pmu[i];
- if (cpu_type == HYBRID_INTEL_ATOM &&
- pmu_type == hybrid_small)
+ if (cpu_type == HYBRID_INTEL_CORE && pmu_type == hybrid_big)
return &x86_pmu.hybrid_pmu[i];
+ if (cpu_type == HYBRID_INTEL_ATOM) {
+ if (x86_pmu.num_hybrid_pmus == 2 && pmu_type == hybrid_small)
+ return &x86_pmu.hybrid_pmu[i];
+
+ native_id = get_this_hybrid_cpu_native_id();
+ if (native_id == skt_native_id && pmu_type == hybrid_small)
+ return &x86_pmu.hybrid_pmu[i];
+ if (native_id == cmt_native_id && pmu_type == hybrid_tiny)
+ return &x86_pmu.hybrid_pmu[i];
+ }
}
return NULL;
@@ -5965,6 +5996,37 @@ static struct attribute *lnl_hybrid_events_attrs[] = {
NULL
};
+/* The event string must be in PMU IDX order. */
+EVENT_ATTR_STR_HYBRID(topdown-retiring,
+ td_retiring_arl_h,
+ "event=0xc2,umask=0x02;event=0x00,umask=0x80;event=0xc2,umask=0x0",
+ hybrid_big_small_tiny);
+EVENT_ATTR_STR_HYBRID(topdown-bad-spec,
+ td_bad_spec_arl_h,
+ "event=0x73,umask=0x0;event=0x00,umask=0x81;event=0x73,umask=0x0",
+ hybrid_big_small_tiny);
+EVENT_ATTR_STR_HYBRID(topdown-fe-bound,
+ td_fe_bound_arl_h,
+ "event=0x9c,umask=0x01;event=0x00,umask=0x82;event=0x71,umask=0x0",
+ hybrid_big_small_tiny);
+EVENT_ATTR_STR_HYBRID(topdown-be-bound,
+ td_be_bound_arl_h,
+ "event=0xa4,umask=0x02;event=0x00,umask=0x83;event=0x74,umask=0x0",
+ hybrid_big_small_tiny);
+
+static struct attribute *arl_h_hybrid_events_attrs[] = {
+ EVENT_PTR(slots_adl),
+ EVENT_PTR(td_retiring_arl_h),
+ EVENT_PTR(td_bad_spec_arl_h),
+ EVENT_PTR(td_fe_bound_arl_h),
+ EVENT_PTR(td_be_bound_arl_h),
+ EVENT_PTR(td_heavy_ops_adl),
+ EVENT_PTR(td_br_mis_adl),
+ EVENT_PTR(td_fetch_lat_adl),
+ EVENT_PTR(td_mem_bound_adl),
+ NULL,
+};
+
/* Must be in IDX order */
EVENT_ATTR_STR_HYBRID(mem-loads, mem_ld_adl, "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3", hybrid_big_small);
EVENT_ATTR_STR_HYBRID(mem-stores, mem_st_adl, "event=0xd0,umask=0x6;event=0xcd,umask=0x2", hybrid_big_small);
@@ -5983,6 +6045,21 @@ static struct attribute *mtl_hybrid_mem_attrs[] = {
NULL
};
+EVENT_ATTR_STR_HYBRID(mem-loads,
+ mem_ld_arl_h,
+ "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3;event=0xd0,umask=0x5,ldlat=3",
+ hybrid_big_small_tiny);
+EVENT_ATTR_STR_HYBRID(mem-stores,
+ mem_st_arl_h,
+ "event=0xd0,umask=0x6;event=0xcd,umask=0x2;event=0xd0,umask=0x6",
+ hybrid_big_small_tiny);
+
+static struct attribute *arl_h_hybrid_mem_attrs[] = {
+ EVENT_PTR(mem_ld_arl_h),
+ EVENT_PTR(mem_st_arl_h),
+ NULL,
+};
+
EVENT_ATTR_STR_HYBRID(tx-start, tx_start_adl, "event=0xc9,umask=0x1", hybrid_big);
EVENT_ATTR_STR_HYBRID(tx-commit, tx_commit_adl, "event=0xc9,umask=0x2", hybrid_big);
EVENT_ATTR_STR_HYBRID(tx-abort, tx_abort_adl, "event=0xc9,umask=0x4", hybrid_big);
@@ -6006,8 +6083,8 @@ static struct attribute *adl_hybrid_tsx_attrs[] = {
FORMAT_ATTR_HYBRID(in_tx, hybrid_big);
FORMAT_ATTR_HYBRID(in_tx_cp, hybrid_big);
-FORMAT_ATTR_HYBRID(offcore_rsp, hybrid_big_small);
-FORMAT_ATTR_HYBRID(ldlat, hybrid_big_small);
+FORMAT_ATTR_HYBRID(offcore_rsp, hybrid_big_small_tiny);
+FORMAT_ATTR_HYBRID(ldlat, hybrid_big_small_tiny);
FORMAT_ATTR_HYBRID(frontend, hybrid_big);
#define ADL_HYBRID_RTM_FORMAT_ATTR \
@@ -6030,7 +6107,7 @@ static struct attribute *adl_hybrid_extra_attr[] = {
NULL
};
-FORMAT_ATTR_HYBRID(snoop_rsp, hybrid_small);
+FORMAT_ATTR_HYBRID(snoop_rsp, hybrid_small_tiny);
static struct attribute *mtl_hybrid_extra_attr_rtm[] = {
ADL_HYBRID_RTM_FORMAT_ATTR,
@@ -6238,8 +6315,9 @@ static inline int intel_pmu_v6_addr_offset(int index, bool eventsel)
}
static const struct { enum hybrid_pmu_type id; char *name; } intel_hybrid_pmu_type_map[] __initconst = {
- { hybrid_small, "cpu_atom" },
- { hybrid_big, "cpu_core" },
+ { hybrid_small, "cpu_atom" },
+ { hybrid_big, "cpu_core" },
+ { hybrid_tiny, "cpu_lowpower" },
};
static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus)
@@ -6272,7 +6350,7 @@ static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus)
0, x86_pmu_num_counters(&pmu->pmu), 0, 0);
pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities;
- if (pmu->pmu_type & hybrid_small) {
+ if (pmu->pmu_type & hybrid_small_tiny) {
pmu->intel_cap.perf_metrics = 0;
pmu->intel_cap.pebs_output_pt_available = 1;
pmu->mid_ack = true;
@@ -7111,6 +7189,37 @@ __init int intel_pmu_init(void)
name = "lunarlake_hybrid";
break;
+ case INTEL_ARROWLAKE_H:
+ intel_pmu_init_hybrid(hybrid_big_small_tiny);
+
+ x86_pmu.pebs_latency_data = arl_h_latency_data;
+ x86_pmu.get_event_constraints = arl_h_get_event_constraints;
+ x86_pmu.hw_config = arl_h_hw_config;
+
+ td_attr = arl_h_hybrid_events_attrs;
+ mem_attr = arl_h_hybrid_mem_attrs;
+ tsx_attr = adl_hybrid_tsx_attrs;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr;
+
+ /* Initialize big core specific PerfMon capabilities. */
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
+ intel_pmu_init_lnc(&pmu->pmu);
+
+ /* Initialize Atom core specific PerfMon capabilities. */
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX];
+ intel_pmu_init_skt(&pmu->pmu);
+
+ /* Initialize Lower Power Atom specific PerfMon capabilities. */
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_TINY_IDX];
+ intel_pmu_init_grt(&pmu->pmu);
+ pmu->extra_regs = intel_cmt_extra_regs;
+
+ intel_pmu_pebs_data_source_arl_h();
+ pr_cont("ArrowLake-H Hybrid events, ");
+ name = "arrowlake_h_hybrid";
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index fa5ea65de0d0..8afc4ad3cd16 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -177,6 +177,17 @@ void __init intel_pmu_pebs_data_source_mtl(void)
__intel_pmu_pebs_data_source_cmt(data_source);
}
+void __init intel_pmu_pebs_data_source_arl_h(void)
+{
+ u64 *data_source;
+
+ intel_pmu_pebs_data_source_lnl();
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_TINY_IDX].pebs_data_source;
+ memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
+ __intel_pmu_pebs_data_source_cmt(data_source);
+}
+
void __init intel_pmu_pebs_data_source_cmt(void)
{
__intel_pmu_pebs_data_source_cmt(pebs_data_source);
@@ -388,6 +399,16 @@ u64 lnl_latency_data(struct perf_event *event, u64 status)
return lnc_latency_data(event, status);
}
+u64 arl_h_latency_data(struct perf_event *event, u64 status)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->pmu_type == hybrid_tiny)
+ return cmt_latency_data(event, status);
+
+ return lnl_latency_data(event, status);
+}
+
static u64 load_latency_data(struct perf_event *event, u64 status)
{
union intel_x86_pebs_dse dse;
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index fd4670a6694e..4b0373bc8ab4 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -418,6 +418,9 @@ static void pt_config_start(struct perf_event *event)
struct pt *pt = this_cpu_ptr(&pt_ctx);
u64 ctl = event->hw.aux_config;
+ if (READ_ONCE(event->hw.aux_paused))
+ return;
+
ctl |= RTIT_CTL_TRACEEN;
if (READ_ONCE(pt->vmx_on))
perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL);
@@ -534,7 +537,24 @@ static void pt_config(struct perf_event *event)
reg |= (event->attr.config & PT_CONFIG_MASK);
event->hw.aux_config = reg;
+
+ /*
+ * Allow resume before starting so as not to overwrite a value set by a
+ * PMI.
+ */
+ barrier();
+ WRITE_ONCE(pt->resume_allowed, 1);
+ /* Configuration is complete, it is now OK to handle an NMI */
+ barrier();
+ WRITE_ONCE(pt->handle_nmi, 1);
+ barrier();
pt_config_start(event);
+ barrier();
+ /*
+ * Allow pause after starting so its pt_config_stop() doesn't race with
+ * pt_config_start().
+ */
+ WRITE_ONCE(pt->pause_allowed, 1);
}
static void pt_config_stop(struct perf_event *event)
@@ -828,11 +848,13 @@ static void pt_buffer_advance(struct pt_buffer *buf)
buf->cur_idx++;
if (buf->cur_idx == buf->cur->last) {
- if (buf->cur == buf->last)
+ if (buf->cur == buf->last) {
buf->cur = buf->first;
- else
+ buf->wrapped = true;
+ } else {
buf->cur = list_entry(buf->cur->list.next, struct topa,
list);
+ }
buf->cur_idx = 0;
}
}
@@ -846,8 +868,11 @@ static void pt_buffer_advance(struct pt_buffer *buf)
static void pt_update_head(struct pt *pt)
{
struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ bool wrapped = buf->wrapped;
u64 topa_idx, base, old;
+ buf->wrapped = false;
+
if (buf->single) {
local_set(&buf->data_size, buf->output_off);
return;
@@ -865,7 +890,7 @@ static void pt_update_head(struct pt *pt)
} else {
old = (local64_xchg(&buf->head, base) &
((buf->nr_pages << PAGE_SHIFT) - 1));
- if (base < old)
+ if (base < old || (base == old && wrapped))
base += buf->nr_pages << PAGE_SHIFT;
local_add(base - old, &buf->data_size);
@@ -1511,6 +1536,7 @@ void intel_pt_interrupt(void)
buf = perf_aux_output_begin(&pt->handle, event);
if (!buf) {
event->hw.state = PERF_HES_STOPPED;
+ WRITE_ONCE(pt->resume_allowed, 0);
return;
}
@@ -1519,6 +1545,7 @@ void intel_pt_interrupt(void)
ret = pt_buffer_reset_markers(buf, &pt->handle);
if (ret) {
perf_aux_output_end(&pt->handle, 0);
+ WRITE_ONCE(pt->resume_allowed, 0);
return;
}
@@ -1573,6 +1600,26 @@ static void pt_event_start(struct perf_event *event, int mode)
struct pt *pt = this_cpu_ptr(&pt_ctx);
struct pt_buffer *buf;
+ if (mode & PERF_EF_RESUME) {
+ if (READ_ONCE(pt->resume_allowed)) {
+ u64 status;
+
+ /*
+ * Only if the trace is not active and the error and
+ * stopped bits are clear, is it safe to start, but a
+ * PMI might have just cleared these, so resume_allowed
+ * must be checked again also.
+ */
+ rdmsrl(MSR_IA32_RTIT_STATUS, status);
+ if (!(status & (RTIT_STATUS_TRIGGEREN |
+ RTIT_STATUS_ERROR |
+ RTIT_STATUS_STOPPED)) &&
+ READ_ONCE(pt->resume_allowed))
+ pt_config_start(event);
+ }
+ return;
+ }
+
buf = perf_aux_output_begin(&pt->handle, event);
if (!buf)
goto fail_stop;
@@ -1583,7 +1630,6 @@ static void pt_event_start(struct perf_event *event, int mode)
goto fail_end_stop;
}
- WRITE_ONCE(pt->handle_nmi, 1);
hwc->state = 0;
pt_config_buffer(buf);
@@ -1601,6 +1647,12 @@ static void pt_event_stop(struct perf_event *event, int mode)
{
struct pt *pt = this_cpu_ptr(&pt_ctx);
+ if (mode & PERF_EF_PAUSE) {
+ if (READ_ONCE(pt->pause_allowed))
+ pt_config_stop(event);
+ return;
+ }
+
/*
* Protect against the PMI racing with disabling wrmsr,
* see comment in intel_pt_interrupt().
@@ -1608,6 +1660,15 @@ static void pt_event_stop(struct perf_event *event, int mode)
WRITE_ONCE(pt->handle_nmi, 0);
barrier();
+ /*
+ * Prevent a resume from attempting to restart tracing, or a pause
+ * during a subsequent start. Do this after clearing handle_nmi so that
+ * pt_event_snapshot_aux() will not re-allow them.
+ */
+ WRITE_ONCE(pt->pause_allowed, 0);
+ WRITE_ONCE(pt->resume_allowed, 0);
+ barrier();
+
pt_config_stop(event);
if (event->hw.state == PERF_HES_STOPPED)
@@ -1657,6 +1718,10 @@ static long pt_event_snapshot_aux(struct perf_event *event,
if (WARN_ON_ONCE(!buf->snapshot))
return 0;
+ /* Prevent pause/resume from attempting to start/stop tracing */
+ WRITE_ONCE(pt->pause_allowed, 0);
+ WRITE_ONCE(pt->resume_allowed, 0);
+ barrier();
/*
* There is no PT interrupt in this mode, so stop the trace and it will
* remain stopped while the buffer is copied.
@@ -1676,8 +1741,13 @@ static long pt_event_snapshot_aux(struct perf_event *event,
* Here, handle_nmi tells us if the tracing was on.
* If the tracing was on, restart it.
*/
- if (READ_ONCE(pt->handle_nmi))
+ if (READ_ONCE(pt->handle_nmi)) {
+ WRITE_ONCE(pt->resume_allowed, 1);
+ barrier();
pt_config_start(event);
+ barrier();
+ WRITE_ONCE(pt->pause_allowed, 1);
+ }
return ret;
}
@@ -1793,7 +1863,9 @@ static __init int pt_init(void)
if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG;
- pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
+ pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE |
+ PERF_PMU_CAP_ITRACE |
+ PERF_PMU_CAP_AUX_PAUSE;
pt_pmu.pmu.attr_groups = pt_attr_groups;
pt_pmu.pmu.task_ctx_nr = perf_sw_context;
pt_pmu.pmu.event_init = pt_event_init;
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index f5e46c04c145..7ee94fc6d7cb 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -65,6 +65,7 @@ struct pt_pmu {
* @head: logical write offset inside the buffer
* @snapshot: if this is for a snapshot/overwrite counter
* @single: use Single Range Output instead of ToPA
+ * @wrapped: buffer advance wrapped back to the first topa table
* @stop_pos: STOP topa entry index
* @intr_pos: INT topa entry index
* @stop_te: STOP topa entry pointer
@@ -82,6 +83,7 @@ struct pt_buffer {
local64_t head;
bool snapshot;
bool single;
+ bool wrapped;
long stop_pos, intr_pos;
struct topa_entry *stop_te, *intr_te;
void **data_pages;
@@ -117,6 +119,8 @@ struct pt_filters {
* @filters: last configured filters
* @handle_nmi: do handle PT PMI on this cpu, there's an active event
* @vmx_on: 1 if VMX is ON on this cpu
+ * @pause_allowed: PERF_EF_PAUSE is allowed to stop tracing
+ * @resume_allowed: PERF_EF_RESUME is allowed to start tracing
* @output_base: cached RTIT_OUTPUT_BASE MSR value
* @output_mask: cached RTIT_OUTPUT_MASK MSR value
*/
@@ -125,6 +129,8 @@ struct pt {
struct pt_filters filters;
int handle_nmi;
int vmx_on;
+ int pause_allowed;
+ int resume_allowed;
u64 output_base;
u64 output_mask;
};
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index ac1182141bf6..82c6f45ce975 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -668,24 +668,38 @@ enum {
#define PERF_PEBS_DATA_SOURCE_GRT_MAX 0x10
#define PERF_PEBS_DATA_SOURCE_GRT_MASK (PERF_PEBS_DATA_SOURCE_GRT_MAX - 1)
+/*
+ * CPUID.1AH.EAX[31:0] uniquely identifies the microarchitecture
+ * of the core. Bits 31-24 indicates its core type (Core or Atom)
+ * and Bits [23:0] indicates the native model ID of the core.
+ * Core type and native model ID are defined in below enumerations.
+ */
enum hybrid_cpu_type {
HYBRID_INTEL_NONE,
HYBRID_INTEL_ATOM = 0x20,
HYBRID_INTEL_CORE = 0x40,
};
+#define X86_HYBRID_PMU_ATOM_IDX 0
+#define X86_HYBRID_PMU_CORE_IDX 1
+#define X86_HYBRID_PMU_TINY_IDX 2
+
enum hybrid_pmu_type {
not_hybrid,
- hybrid_small = BIT(0),
- hybrid_big = BIT(1),
-
- hybrid_big_small = hybrid_big | hybrid_small, /* only used for matching */
+ hybrid_small = BIT(X86_HYBRID_PMU_ATOM_IDX),
+ hybrid_big = BIT(X86_HYBRID_PMU_CORE_IDX),
+ hybrid_tiny = BIT(X86_HYBRID_PMU_TINY_IDX),
+
+ /* The belows are only used for matching */
+ hybrid_big_small = hybrid_big | hybrid_small,
+ hybrid_small_tiny = hybrid_small | hybrid_tiny,
+ hybrid_big_small_tiny = hybrid_big | hybrid_small_tiny,
};
-#define X86_HYBRID_PMU_ATOM_IDX 0
-#define X86_HYBRID_PMU_CORE_IDX 1
-
-#define X86_HYBRID_NUM_PMUS 2
+enum atom_native_id {
+ cmt_native_id = 0x2, /* Crestmont */
+ skt_native_id = 0x3, /* Skymont */
+};
struct x86_hybrid_pmu {
struct pmu pmu;
@@ -1578,6 +1592,8 @@ u64 cmt_latency_data(struct perf_event *event, u64 status);
u64 lnl_latency_data(struct perf_event *event, u64 status);
+u64 arl_h_latency_data(struct perf_event *event, u64 status);
+
extern struct event_constraint intel_core2_pebs_event_constraints[];
extern struct event_constraint intel_atom_pebs_event_constraints[];
@@ -1697,6 +1713,8 @@ void intel_pmu_pebs_data_source_grt(void);
void intel_pmu_pebs_data_source_mtl(void);
+void intel_pmu_pebs_data_source_arl_h(void);
+
void intel_pmu_pebs_data_source_cmt(void);
void intel_pmu_pebs_data_source_lnl(void);
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index a481a939862e..a8defc813c36 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -148,7 +148,6 @@ struct rapl_model {
/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;
-static cpumask_t rapl_cpu_mask;
static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
@@ -369,8 +368,6 @@ static int rapl_pmu_event_init(struct perf_event *event)
if (event->cpu < 0)
return -EINVAL;
- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
-
if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
return -EINVAL;
@@ -389,7 +386,6 @@ static int rapl_pmu_event_init(struct perf_event *event)
pmu = cpu_to_rapl_pmu(event->cpu);
if (!pmu)
return -EINVAL;
- event->cpu = pmu->cpu;
event->pmu_private = pmu;
event->hw.event_base = rapl_msrs[bit].msr;
event->hw.config = cfg;
@@ -403,23 +399,6 @@ static void rapl_pmu_event_read(struct perf_event *event)
rapl_event_update(event);
}
-static ssize_t rapl_get_attr_cpumask(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
-}
-
-static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
-
-static struct attribute *rapl_pmu_attrs[] = {
- &dev_attr_cpumask.attr,
- NULL,
-};
-
-static struct attribute_group rapl_pmu_attr_group = {
- .attrs = rapl_pmu_attrs,
-};
-
RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
@@ -467,7 +446,6 @@ static struct attribute_group rapl_pmu_format_group = {
};
static const struct attribute_group *rapl_attr_groups[] = {
- &rapl_pmu_attr_group,
&rapl_pmu_format_group,
&rapl_pmu_events_group,
NULL,
@@ -570,65 +548,6 @@ static struct perf_msr amd_rapl_msrs[] = {
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
};
-static int rapl_cpu_offline(unsigned int cpu)
-{
- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
- int target;
-
- /* Check if exiting cpu is used for collecting rapl events */
- if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
- return 0;
-
- pmu->cpu = -1;
- /* Find a new cpu to collect rapl events */
- target = cpumask_any_but(get_rapl_pmu_cpumask(cpu), cpu);
-
- /* Migrate rapl events to the new target */
- if (target < nr_cpu_ids) {
- cpumask_set_cpu(target, &rapl_cpu_mask);
- pmu->cpu = target;
- perf_pmu_migrate_context(pmu->pmu, cpu, target);
- }
- return 0;
-}
-
-static int rapl_cpu_online(unsigned int cpu)
-{
- s32 rapl_pmu_idx = get_rapl_pmu_idx(cpu);
- if (rapl_pmu_idx < 0) {
- pr_err("topology_logical_(package/die)_id() returned a negative value");
- return -EINVAL;
- }
- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
- int target;
-
- if (!pmu) {
- pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
- if (!pmu)
- return -ENOMEM;
-
- raw_spin_lock_init(&pmu->lock);
- INIT_LIST_HEAD(&pmu->active_list);
- pmu->pmu = &rapl_pmus->pmu;
- pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
- rapl_hrtimer_init(pmu);
-
- rapl_pmus->pmus[rapl_pmu_idx] = pmu;
- }
-
- /*
- * Check if there is an online cpu in the package which collects rapl
- * events already.
- */
- target = cpumask_any_and(&rapl_cpu_mask, get_rapl_pmu_cpumask(cpu));
- if (target < nr_cpu_ids)
- return 0;
-
- cpumask_set_cpu(cpu, &rapl_cpu_mask);
- pmu->cpu = cpu;
- return 0;
-}
-
static int rapl_check_hw_unit(struct rapl_model *rm)
{
u64 msr_rapl_power_unit_bits;
@@ -707,12 +626,41 @@ static const struct attribute_group *rapl_attr_update[] = {
NULL,
};
+static int __init init_rapl_pmu(void)
+{
+ struct rapl_pmu *pmu;
+ int idx;
+
+ for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) {
+ pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
+ if (!pmu)
+ goto free;
+
+ raw_spin_lock_init(&pmu->lock);
+ INIT_LIST_HEAD(&pmu->active_list);
+ pmu->pmu = &rapl_pmus->pmu;
+ pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
+ rapl_hrtimer_init(pmu);
+
+ rapl_pmus->pmus[idx] = pmu;
+ }
+
+ return 0;
+free:
+ for (; idx > 0; idx--)
+ kfree(rapl_pmus->pmus[idx - 1]);
+ return -ENOMEM;
+}
+
static int __init init_rapl_pmus(void)
{
int nr_rapl_pmu = topology_max_packages();
+ int rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
- if (!rapl_pmu_is_pkg_scope())
+ if (!rapl_pmu_is_pkg_scope()) {
nr_rapl_pmu *= topology_max_dies_per_package();
+ rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
+ }
rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
if (!rapl_pmus)
@@ -728,9 +676,11 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.start = rapl_pmu_event_start;
rapl_pmus->pmu.stop = rapl_pmu_event_stop;
rapl_pmus->pmu.read = rapl_pmu_event_read;
+ rapl_pmus->pmu.scope = rapl_pmu_scope;
rapl_pmus->pmu.module = THIS_MODULE;
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
- return 0;
+
+ return init_rapl_pmu();
}
static struct rapl_model model_snb = {
@@ -876,24 +826,13 @@ static int __init rapl_pmu_init(void)
if (ret)
return ret;
- /*
- * Install callbacks. Core will call them for each online cpu.
- */
- ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
- "perf/x86/rapl:online",
- rapl_cpu_online, rapl_cpu_offline);
- if (ret)
- goto out;
-
ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
if (ret)
- goto out1;
+ goto out;
rapl_advertise();
return 0;
-out1:
- cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
out:
pr_warn("Initialization failed (%d), disabled\n", ret);
cleanup_rapl_pmus();
@@ -903,7 +842,6 @@ module_init(rapl_pmu_init);
static void __exit intel_rapl_exit(void)
{
- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
perf_pmu_unregister(&rapl_pmus->pmu);
cleanup_rapl_pmus();
}
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index aa30fd8cad7f..5af69b5be2fb 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -32,6 +32,7 @@ extern bool handle_user_split_lock(struct pt_regs *regs, long error_code);
extern bool handle_guest_split_lock(unsigned long ip);
extern void handle_bus_lock(struct pt_regs *regs);
u8 get_this_hybrid_cpu_type(void);
+u32 get_this_hybrid_cpu_native_id(void);
#else
static inline void __init sld_setup(struct cpuinfo_x86 *c) {}
static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code)
@@ -50,6 +51,11 @@ static inline u8 get_this_hybrid_cpu_type(void)
{
return 0;
}
+
+static inline u32 get_this_hybrid_cpu_native_id(void)
+{
+ return 0;
+}
#endif
#ifdef CONFIG_IA32_FEAT_CTL
void init_ia32_feat_ctl(struct cpuinfo_x86 *c);
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 91b73571412f..d95f902acc52 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -536,15 +536,17 @@ struct x86_perf_regs {
u64 *xmm_regs;
};
-extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
-extern unsigned long perf_misc_flags(struct pt_regs *regs);
-#define perf_misc_flags(regs) perf_misc_flags(regs)
+extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_arch_misc_flags(struct pt_regs *regs);
+extern unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs);
+#define perf_arch_misc_flags(regs) perf_arch_misc_flags(regs)
+#define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs)
#include <asm/stacktrace.h>
/*
- * We abuse bit 3 from flags to pass exact information, see perf_misc_flags
- * and the comment with PERF_EFLAGS_EXACT.
+ * We abuse bit 3 from flags to pass exact information, see
+ * perf_arch_misc_flags() and the comment with PERF_EFLAGS_EXACT.
*/
#define perf_arch_fetch_caller_regs(regs, __ip) { \
(regs)->ip = (__ip); \
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index e7656cbef68d..624397e43ac6 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1299,3 +1299,18 @@ u8 get_this_hybrid_cpu_type(void)
return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT;
}
+
+/**
+ * get_this_hybrid_cpu_native_id() - Get the native id of this hybrid CPU
+ *
+ * Returns the uarch native ID [23:0] of a CPU in a hybrid processor.
+ * If the processor is not hybrid, returns 0.
+ */
+u32 get_this_hybrid_cpu_native_id(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ return 0;
+
+ return cpuid_eax(0x0000001a) &
+ (BIT_ULL(X86_HYBRID_CPU_TYPE_ID_SHIFT) - 1);
+}
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 61d9a66d1807..f1d151c03164 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -208,7 +208,6 @@ enum cpuhp_state {
CPUHP_AP_PERF_X86_UNCORE_ONLINE,
CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
CPUHP_AP_PERF_X86_AMD_POWER_ONLINE,
- CPUHP_AP_PERF_X86_RAPL_ONLINE,
CPUHP_AP_PERF_S390_CF_ONLINE,
CPUHP_AP_PERF_S390_SF_ONLINE,
CPUHP_AP_PERF_ARM_CCI_ONLINE,
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fb908843f209..cb99ec8c9e96 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -170,6 +170,12 @@ struct hw_perf_event {
};
struct { /* aux / Intel-PT */
u64 aux_config;
+ /*
+ * For AUX area events, aux_paused cannot be a state
+ * flag because it can be updated asynchronously to
+ * state.
+ */
+ unsigned int aux_paused;
};
struct { /* software */
struct hrtimer hrtimer;
@@ -294,6 +300,7 @@ struct perf_event_pmu_context;
#define PERF_PMU_CAP_NO_EXCLUDE 0x0040
#define PERF_PMU_CAP_AUX_OUTPUT 0x0080
#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
+#define PERF_PMU_CAP_AUX_PAUSE 0x0200
/**
* pmu::scope
@@ -384,6 +391,8 @@ struct pmu {
#define PERF_EF_START 0x01 /* start the counter when adding */
#define PERF_EF_RELOAD 0x02 /* reload the counter when starting */
#define PERF_EF_UPDATE 0x04 /* update the counter when stopping */
+#define PERF_EF_PAUSE 0x08 /* AUX area event, pause tracing */
+#define PERF_EF_RESUME 0x10 /* AUX area event, resume tracing */
/*
* Adds/Removes a counter to/from the PMU, can be done inside a
@@ -423,6 +432,18 @@ struct pmu {
*
* ->start() with PERF_EF_RELOAD will reprogram the counter
* value, must be preceded by a ->stop() with PERF_EF_UPDATE.
+ *
+ * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not
+ * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with
+ * PERF_EF_RESUME.
+ *
+ * ->start() with PERF_EF_RESUME will start as simply as possible but
+ * only if the counter is not otherwise stopped. Will not overlap
+ * another ->start() with PERF_EF_RESUME nor ->stop() with
+ * PERF_EF_PAUSE.
+ *
+ * Notably, PERF_EF_PAUSE/PERF_EF_RESUME *can* be concurrent with other
+ * ->stop()/->start() invocations, just not itself.
*/
void (*start) (struct perf_event *event, int flags);
void (*stop) (struct perf_event *event, int flags);
@@ -1655,15 +1676,35 @@ extern void perf_tp_event(u16 event_type, u64 count, void *record,
struct task_struct *task);
extern void perf_bp_event(struct perf_event *event, void *data);
-#ifndef perf_misc_flags
-# define perf_misc_flags(regs) \
+extern unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs);
+extern unsigned long perf_instruction_pointer(struct perf_event *event,
+ struct pt_regs *regs);
+
+#ifndef perf_arch_misc_flags
+# define perf_arch_misc_flags(regs) \
(user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL)
-# define perf_instruction_pointer(regs) instruction_pointer(regs)
+# define perf_arch_instruction_pointer(regs) instruction_pointer(regs)
#endif
#ifndef perf_arch_bpf_user_pt_regs
# define perf_arch_bpf_user_pt_regs(regs) regs
#endif
+#ifndef perf_arch_guest_misc_flags
+static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs)
+{
+ unsigned long guest_state = perf_guest_state();
+
+ if (!(guest_state & PERF_GUEST_ACTIVE))
+ return 0;
+
+ if (guest_state & PERF_GUEST_USER)
+ return PERF_RECORD_MISC_GUEST_USER;
+ else
+ return PERF_RECORD_MISC_GUEST_KERNEL;
+}
+# define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs)
+#endif
+
static inline bool has_branch_stack(struct perf_event *event)
{
return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
@@ -1679,6 +1720,13 @@ static inline bool has_aux(struct perf_event *event)
return event->pmu->setup_aux;
}
+static inline bool has_aux_action(struct perf_event *event)
+{
+ return event->attr.aux_sample_size ||
+ event->attr.aux_pause ||
+ event->attr.aux_resume;
+}
+
static inline bool is_write_backward(struct perf_event *event)
{
return !!event->attr.write_backward;
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 2b294bf1881f..e0a4c2082245 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -15,6 +15,7 @@
#include <linux/rbtree.h>
#include <linux/types.h>
#include <linux/wait.h>
+#include <linux/timer.h>
struct uprobe;
struct vm_area_struct;
@@ -23,8 +24,17 @@ struct inode;
struct notifier_block;
struct page;
+/*
+ * Allowed return values from uprobe consumer's handler callback
+ * with following meaning:
+ *
+ * UPROBE_HANDLER_REMOVE
+ * - Remove the uprobe breakpoint from current->mm.
+ * UPROBE_HANDLER_IGNORE
+ * - Ignore ret_handler callback for this consumer.
+ */
#define UPROBE_HANDLER_REMOVE 1
-#define UPROBE_HANDLER_MASK 1
+#define UPROBE_HANDLER_IGNORE 2
#define MAX_URETPROBE_DEPTH 64
@@ -37,13 +47,15 @@ struct uprobe_consumer {
* for the current process. If filter() is omitted or returns true,
* UPROBE_HANDLER_REMOVE is effectively ignored.
*/
- int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
+ int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data);
int (*ret_handler)(struct uprobe_consumer *self,
unsigned long func,
- struct pt_regs *regs);
+ struct pt_regs *regs, __u64 *data);
bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
struct list_head cons_node;
+
+ __u64 id; /* set when uprobe_consumer is registered */
};
#ifdef CONFIG_UPROBES
@@ -56,12 +68,62 @@ enum uprobe_task_state {
UTASK_SSTEP_TRAPPED,
};
+/* The state of hybrid-lifetime uprobe inside struct return_instance */
+enum hprobe_state {
+ HPROBE_LEASED, /* uretprobes_srcu-protected uprobe */
+ HPROBE_STABLE, /* refcounted uprobe */
+ HPROBE_GONE, /* NULL uprobe, SRCU expired, refcount failed */
+ HPROBE_CONSUMED, /* uprobe "consumed" by uretprobe handler */
+};
+
+/*
+ * Hybrid lifetime uprobe. Represents a uprobe instance that could be either
+ * SRCU protected (with SRCU protection eventually potentially timing out),
+ * refcounted using uprobe->ref, or there could be no valid uprobe (NULL).
+ *
+ * hprobe's internal state is setup such that background timer thread can
+ * atomically "downgrade" temporarily RCU-protected uprobe into refcounted one
+ * (or no uprobe, if refcounting failed).
+ *
+ * *stable* pointer always point to the uprobe (or could be NULL if there is
+ * was no valid underlying uprobe to begin with).
+ *
+ * *leased* pointer is the key to achieving race-free atomic lifetime state
+ * transition and can have three possible states:
+ * - either the same non-NULL value as *stable*, in which case uprobe is
+ * SRCU-protected;
+ * - NULL, in which case uprobe (if there is any) is refcounted;
+ * - special __UPROBE_DEAD value, which represents an uprobe that was SRCU
+ * protected initially, but SRCU period timed out and we attempted to
+ * convert it to refcounted, but refcount_inc_not_zero() failed, because
+ * uprobe effectively went away (the last consumer unsubscribed). In this
+ * case it's important to know that *stable* pointer (which still has
+ * non-NULL uprobe pointer) shouldn't be used, because lifetime of
+ * underlying uprobe is not guaranteed anymore. __UPROBE_DEAD is just an
+ * internal marker and is handled transparently by hprobe_fetch() helper.
+ *
+ * When uprobe is SRCU-protected, we also record srcu_idx value, necessary for
+ * SRCU unlocking.
+ *
+ * See hprobe_expire() and hprobe_fetch() for details of race-free uprobe
+ * state transitioning details. It all hinges on atomic xchg() over *leaded*
+ * pointer. *stable* pointer, once initially set, is not modified concurrently.
+ */
+struct hprobe {
+ enum hprobe_state state;
+ int srcu_idx;
+ struct uprobe *uprobe;
+};
+
/*
* uprobe_task: Metadata of a task while it singlesteps.
*/
struct uprobe_task {
enum uprobe_task_state state;
+ unsigned int depth;
+ struct return_instance *return_instances;
+
union {
struct {
struct arch_uprobe_task autask;
@@ -75,23 +137,30 @@ struct uprobe_task {
};
struct uprobe *active_uprobe;
+ struct timer_list ri_timer;
unsigned long xol_vaddr;
struct arch_uprobe *auprobe;
+};
- struct return_instance *return_instances;
- unsigned int depth;
+struct return_consumer {
+ __u64 cookie;
+ __u64 id;
};
struct return_instance {
- struct uprobe *uprobe;
+ struct hprobe hprobe;
unsigned long func;
unsigned long stack; /* stack pointer */
unsigned long orig_ret_vaddr; /* original return address */
bool chained; /* true, if instance is nested */
+ int consumers_cnt;
struct return_instance *next; /* keep as stack */
-};
+ struct rcu_head rcu;
+
+ struct return_consumer consumers[] __counted_by(consumers_cnt);
+} ____cacheline_aligned;
enum rp_check {
RP_CHECK_CALL,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 4842c36fdf80..0524d541d4e3 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -511,7 +511,16 @@ struct perf_event_attr {
__u16 sample_max_stack;
__u16 __reserved_2;
__u32 aux_sample_size;
- __u32 __reserved_3;
+
+ union {
+ __u32 aux_action;
+ struct {
+ __u32 aux_start_paused : 1, /* start AUX area tracing paused */
+ aux_pause : 1, /* on overflow, pause AUX area tracing */
+ aux_resume : 1, /* on overflow, resume AUX area tracing */
+ __reserved_3 : 29;
+ };
+ };
/*
* User provided data if sigtrap=1, passed back to user via
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b65446be00a7..5d4a54f50826 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2142,7 +2142,7 @@ static void perf_put_aux_event(struct perf_event *event)
static bool perf_need_aux_event(struct perf_event *event)
{
- return !!event->attr.aux_output || !!event->attr.aux_sample_size;
+ return event->attr.aux_output || has_aux_action(event);
}
static int perf_get_aux_event(struct perf_event *event,
@@ -2167,6 +2167,10 @@ static int perf_get_aux_event(struct perf_event *event,
!perf_aux_output_match(event, group_leader))
return 0;
+ if ((event->attr.aux_pause || event->attr.aux_resume) &&
+ !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
+ return 0;
+
if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
return 0;
@@ -7003,6 +7007,29 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
#endif
+static bool should_sample_guest(struct perf_event *event)
+{
+ return !event->attr.exclude_guest && perf_guest_state();
+}
+
+unsigned long perf_misc_flags(struct perf_event *event,
+ struct pt_regs *regs)
+{
+ if (should_sample_guest(event))
+ return perf_arch_guest_misc_flags(regs);
+
+ return perf_arch_misc_flags(regs);
+}
+
+unsigned long perf_instruction_pointer(struct perf_event *event,
+ struct pt_regs *regs)
+{
+ if (should_sample_guest(event))
+ return perf_guest_get_ip();
+
+ return perf_arch_instruction_pointer(regs);
+}
+
static void
perf_output_sample_regs(struct perf_output_handle *handle,
struct pt_regs *regs, u64 mask)
@@ -7820,7 +7847,7 @@ void perf_prepare_sample(struct perf_sample_data *data,
__perf_event_header__init_id(data, event, filtered_sample_type);
if (filtered_sample_type & PERF_SAMPLE_IP) {
- data->ip = perf_instruction_pointer(regs);
+ data->ip = perf_instruction_pointer(event, regs);
data->sample_flags |= PERF_SAMPLE_IP;
}
@@ -7984,7 +8011,7 @@ void perf_prepare_header(struct perf_event_header *header,
{
header->type = PERF_RECORD_SAMPLE;
header->size = perf_sample_data_size(data, event);
- header->misc = perf_misc_flags(regs);
+ header->misc = perf_misc_flags(event, regs);
/*
* If you're adding more sample types here, you likely need to do
@@ -7997,6 +8024,49 @@ void perf_prepare_header(struct perf_event_header *header,
WARN_ON_ONCE(header->size & 7);
}
+static void __perf_event_aux_pause(struct perf_event *event, bool pause)
+{
+ if (pause) {
+ if (!event->hw.aux_paused) {
+ event->hw.aux_paused = 1;
+ event->pmu->stop(event, PERF_EF_PAUSE);
+ }
+ } else {
+ if (event->hw.aux_paused) {
+ event->hw.aux_paused = 0;
+ event->pmu->start(event, PERF_EF_RESUME);
+ }
+ }
+}
+
+static void perf_event_aux_pause(struct perf_event *event, bool pause)
+{
+ struct perf_buffer *rb;
+
+ if (WARN_ON_ONCE(!event))
+ return;
+
+ rb = ring_buffer_get(event);
+ if (!rb)
+ return;
+
+ scoped_guard (irqsave) {
+ /*
+ * Guard against self-recursion here. Another event could trip
+ * this same from NMI context.
+ */
+ if (READ_ONCE(rb->aux_in_pause_resume))
+ break;
+
+ WRITE_ONCE(rb->aux_in_pause_resume, 1);
+ barrier();
+ __perf_event_aux_pause(event, pause);
+ barrier();
+ WRITE_ONCE(rb->aux_in_pause_resume, 0);
+ }
+ ring_buffer_put(rb);
+}
+
static __always_inline int
__perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
@@ -9799,9 +9869,12 @@ static int __perf_event_overflow(struct perf_event *event,
ret = __perf_event_account_interrupt(event, throttle);
+ if (event->attr.aux_pause)
+ perf_event_aux_pause(event->aux_event, true);
+
if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT &&
!bpf_overflow_handler(event, data, regs))
- return ret;
+ goto out;
/*
* XXX event_limit might not quite work as expected on inherited
@@ -9863,6 +9936,9 @@ static int __perf_event_overflow(struct perf_event *event,
event->pending_wakeup = 1;
irq_work_queue(&event->pending_irq);
}
+out:
+ if (event->attr.aux_resume)
+ perf_event_aux_pause(event->aux_event, false);
return ret;
}
@@ -12254,11 +12330,25 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
if (event->attr.aux_output &&
- !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
+ (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) ||
+ event->attr.aux_pause || event->attr.aux_resume)) {
err = -EOPNOTSUPP;
goto err_pmu;
}
+ if (event->attr.aux_pause && event->attr.aux_resume) {
+ err = -EINVAL;
+ goto err_pmu;
+ }
+
+ if (event->attr.aux_start_paused) {
+ if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) {
+ err = -EOPNOTSUPP;
+ goto err_pmu;
+ }
+ event->hw.aux_paused = 1;
+ }
+
if (cgroup_fd != -1) {
err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
if (err)
@@ -13052,7 +13142,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
* Grouping is not supported for kernel events, neither is 'AUX',
* make sure the caller's intentions are adjusted.
*/
- if (attr->aux_output)
+ if (attr->aux_output || attr->aux_action)
return ERR_PTR(-EINVAL);
event = perf_event_alloc(attr, cpu, task, NULL, NULL,
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index e072d995d670..249288d82b8d 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -52,6 +52,7 @@ struct perf_buffer {
void (*free_aux)(void *);
refcount_t aux_refcount;
int aux_in_sampling;
+ int aux_in_pause_resume;
void **aux_pages;
void *aux_priv;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4b52cb2ae6d6..a76ddc5fc982 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -26,6 +26,9 @@
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/workqueue.h>
+#include <linux/srcu.h>
#include <linux/uprobes.h>
@@ -42,8 +45,6 @@ static struct rb_root uprobes_tree = RB_ROOT;
static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */
static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
-DEFINE_STATIC_SRCU(uprobes_srcu);
-
#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
@@ -51,6 +52,9 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
+/* Covers return_instance's uprobe lifetime. */
+DEFINE_STATIC_SRCU(uretprobes_srcu);
+
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
@@ -62,10 +66,13 @@ struct uprobe {
struct list_head pending_list;
struct list_head consumers;
struct inode *inode; /* Also hold a ref to inode */
- struct rcu_head rcu;
+ union {
+ struct rcu_head rcu;
+ struct work_struct work;
+ };
loff_t offset;
loff_t ref_ctr_offset;
- unsigned long flags;
+ unsigned long flags; /* "unsigned long" so bitops work */
/*
* The generic code assumes that it has two members of unknown type
@@ -100,7 +107,6 @@ static LIST_HEAD(delayed_uprobe_list);
*/
struct xol_area {
wait_queue_head_t wq; /* if all slots are busy */
- atomic_t slot_count; /* number of in-use slots */
unsigned long *bitmap; /* 0 = free slot */
struct page *page;
@@ -620,17 +626,23 @@ static inline bool uprobe_is_active(struct uprobe *uprobe)
return !RB_EMPTY_NODE(&uprobe->rb_node);
}
-static void uprobe_free_rcu(struct rcu_head *rcu)
+static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu)
{
struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
kfree(uprobe);
}
-static void put_uprobe(struct uprobe *uprobe)
+static void uprobe_free_srcu(struct rcu_head *rcu)
{
- if (!refcount_dec_and_test(&uprobe->ref))
- return;
+ struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+
+ call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace);
+}
+
+static void uprobe_free_deferred(struct work_struct *work)
+{
+ struct uprobe *uprobe = container_of(work, struct uprobe, work);
write_lock(&uprobes_treelock);
@@ -651,7 +663,162 @@ static void put_uprobe(struct uprobe *uprobe)
delayed_uprobe_remove(uprobe, NULL);
mutex_unlock(&delayed_uprobe_lock);
- call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
+ /* start srcu -> rcu_tasks_trace -> kfree chain */
+ call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu);
+}
+
+static void put_uprobe(struct uprobe *uprobe)
+{
+ if (!refcount_dec_and_test(&uprobe->ref))
+ return;
+
+ INIT_WORK(&uprobe->work, uprobe_free_deferred);
+ schedule_work(&uprobe->work);
+}
+
+/* Initialize hprobe as SRCU-protected "leased" uprobe */
+static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx)
+{
+ WARN_ON(!uprobe);
+ hprobe->state = HPROBE_LEASED;
+ hprobe->uprobe = uprobe;
+ hprobe->srcu_idx = srcu_idx;
+}
+
+/* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */
+static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe)
+{
+ hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE;
+ hprobe->uprobe = uprobe;
+ hprobe->srcu_idx = -1;
+}
+
+/*
+ * hprobe_consume() fetches hprobe's underlying uprobe and detects whether
+ * uprobe is SRCU protected or is refcounted. hprobe_consume() can be
+ * used only once for a given hprobe.
+ *
+ * Caller has to call hprobe_finalize() and pass previous hprobe_state, so
+ * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever
+ * is appropriate.
+ */
+static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate)
+{
+ *hstate = xchg(&hprobe->state, HPROBE_CONSUMED);
+ switch (*hstate) {
+ case HPROBE_LEASED:
+ case HPROBE_STABLE:
+ return hprobe->uprobe;
+ case HPROBE_GONE: /* uprobe is NULL, no SRCU */
+ case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */
+ return NULL;
+ default:
+ WARN(1, "hprobe invalid state %d", *hstate);
+ return NULL;
+ }
+}
+
+/*
+ * Reset hprobe state and, if hprobe was LEASED, release SRCU lock.
+ * hprobe_finalize() can only be used from current context after
+ * hprobe_consume() call (which determines uprobe and hstate value).
+ */
+static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate)
+{
+ switch (hstate) {
+ case HPROBE_LEASED:
+ __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
+ break;
+ case HPROBE_STABLE:
+ put_uprobe(hprobe->uprobe);
+ break;
+ case HPROBE_GONE:
+ case HPROBE_CONSUMED:
+ break;
+ default:
+ WARN(1, "hprobe invalid state %d", hstate);
+ break;
+ }
+}
+
+/*
+ * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED)
+ * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of
+ * them can win the race to perform SRCU unlocking. Whoever wins must perform
+ * SRCU unlock.
+ *
+ * Returns underlying valid uprobe or NULL, if there was no underlying uprobe
+ * to begin with or we failed to bump its refcount and it's going away.
+ *
+ * Returned non-NULL uprobe can be still safely used within an ongoing SRCU
+ * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has
+ * an extra refcount for caller to assume and use. Otherwise, it's not
+ * guaranteed that returned uprobe has a positive refcount, so caller has to
+ * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current
+ * SRCU lock region. See dup_utask().
+ */
+static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get)
+{
+ enum hprobe_state hstate;
+
+ /*
+ * return_instance's hprobe is protected by RCU.
+ * Underlying uprobe is itself protected from reuse by SRCU.
+ */
+ lockdep_assert(rcu_read_lock_held() && srcu_read_lock_held(&uretprobes_srcu));
+
+ hstate = READ_ONCE(hprobe->state);
+ switch (hstate) {
+ case HPROBE_STABLE:
+ /* uprobe has positive refcount, bump refcount, if necessary */
+ return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe;
+ case HPROBE_GONE:
+ /*
+ * SRCU was unlocked earlier and we didn't manage to take
+ * uprobe refcnt, so it's effectively NULL
+ */
+ return NULL;
+ case HPROBE_CONSUMED:
+ /*
+ * uprobe was consumed, so it's effectively NULL as far as
+ * uretprobe processing logic is concerned
+ */
+ return NULL;
+ case HPROBE_LEASED: {
+ struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe);
+ /*
+ * Try to switch hprobe state, guarding against
+ * hprobe_consume() or another hprobe_expire() racing with us.
+ * Note, if we failed to get uprobe refcount, we use special
+ * HPROBE_GONE state to signal that hprobe->uprobe shouldn't
+ * be used as it will be freed after SRCU is unlocked.
+ */
+ if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) {
+ /* We won the race, we are the ones to unlock SRCU */
+ __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
+ return get ? get_uprobe(uprobe) : uprobe;
+ }
+
+ /*
+ * We lost the race, undo refcount bump (if it ever happened),
+ * unless caller would like an extra refcount anyways.
+ */
+ if (uprobe && !get)
+ put_uprobe(uprobe);
+ /*
+ * Even if hprobe_consume() or another hprobe_expire() wins
+ * the state update race and unlocks SRCU from under us, we
+ * still have a guarantee that underyling uprobe won't be
+ * freed due to ongoing caller's SRCU lock region, so we can
+ * return it regardless. Also, if `get` was true, we also have
+ * an extra ref for the caller to own. This is used in dup_utask().
+ */
+ return uprobe;
+ }
+ default:
+ WARN(1, "unknown hprobe state %d", hstate);
+ return NULL;
+ }
}
static __always_inline
@@ -706,7 +873,7 @@ static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
struct rb_node *node;
unsigned int seq;
- lockdep_assert(srcu_read_lock_held(&uprobes_srcu));
+ lockdep_assert(rcu_read_lock_trace_held());
do {
seq = read_seqcount_begin(&uprobes_seqcount);
@@ -825,8 +992,11 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
+ static atomic64_t id;
+
down_write(&uprobe->consumer_rwsem);
list_add_rcu(&uc->cons_node, &uprobe->consumers);
+ uc->id = (__u64) atomic64_inc_return(&id);
up_write(&uprobe->consumer_rwsem);
}
@@ -934,8 +1104,7 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
bool ret = false;
down_read(&uprobe->consumer_rwsem);
- list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
- srcu_read_lock_held(&uprobes_srcu)) {
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
ret = consumer_filter(uc, mm);
if (ret)
break;
@@ -1156,7 +1325,8 @@ void uprobe_unregister_sync(void)
* unlucky enough caller can free consumer's memory and cause
* handler_chain() or handle_uretprobe_chain() to do an use-after-free.
*/
- synchronize_srcu(&uprobes_srcu);
+ synchronize_rcu_tasks_trace();
+ synchronize_srcu(&uretprobes_srcu);
}
EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
@@ -1240,19 +1410,18 @@ EXPORT_SYMBOL_GPL(uprobe_register);
int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
{
struct uprobe_consumer *con;
- int ret = -ENOENT, srcu_idx;
+ int ret = -ENOENT;
down_write(&uprobe->register_rwsem);
- srcu_idx = srcu_read_lock(&uprobes_srcu);
- list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
- srcu_read_lock_held(&uprobes_srcu)) {
+ rcu_read_lock_trace();
+ list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
if (con == uc) {
ret = register_for_each_vma(uprobe, add ? uc : NULL);
break;
}
}
- srcu_read_unlock(&uprobes_srcu, srcu_idx);
+ rcu_read_unlock_trace();
up_write(&uprobe->register_rwsem);
@@ -1475,9 +1644,15 @@ static vm_fault_t xol_fault(const struct vm_special_mapping *sm,
return 0;
}
+static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
+{
+ return -EPERM;
+}
+
static const struct vm_special_mapping xol_mapping = {
.name = "[uprobes]",
.fault = xol_fault,
+ .mremap = xol_mremap,
};
/* Slot allocation for XOL */
@@ -1553,7 +1728,6 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
init_waitqueue_head(&area->wq);
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
- atomic_set(&area->slot_count, 1);
insns = arch_uprobe_trampoline(&insns_size);
arch_uprobe_copy_ixol(area->page, 0, insns, insns_size);
@@ -1626,92 +1800,57 @@ void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
}
}
-/*
- * - search for a free slot.
- */
-static unsigned long xol_take_insn_slot(struct xol_area *area)
+static unsigned long xol_get_slot_nr(struct xol_area *area)
{
- unsigned long slot_addr;
- int slot_nr;
-
- do {
- slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
- if (slot_nr < UINSNS_PER_PAGE) {
- if (!test_and_set_bit(slot_nr, area->bitmap))
- break;
+ unsigned long slot_nr;
- slot_nr = UINSNS_PER_PAGE;
- continue;
- }
- wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
- } while (slot_nr >= UINSNS_PER_PAGE);
-
- slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
- atomic_inc(&area->slot_count);
+ slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
+ if (slot_nr < UINSNS_PER_PAGE) {
+ if (!test_and_set_bit(slot_nr, area->bitmap))
+ return slot_nr;
+ }
- return slot_addr;
+ return UINSNS_PER_PAGE;
}
/*
* xol_get_insn_slot - allocate a slot for xol.
- * Returns the allocated slot address or 0.
*/
-static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
+static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask)
{
- struct xol_area *area;
- unsigned long xol_vaddr;
+ struct xol_area *area = get_xol_area();
+ unsigned long slot_nr;
- area = get_xol_area();
if (!area)
- return 0;
+ return false;
- xol_vaddr = xol_take_insn_slot(area);
- if (unlikely(!xol_vaddr))
- return 0;
+ wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE);
- arch_uprobe_copy_ixol(area->page, xol_vaddr,
+ utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES;
+ arch_uprobe_copy_ixol(area->page, utask->xol_vaddr,
&uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
-
- return xol_vaddr;
+ return true;
}
/*
- * xol_free_insn_slot - If slot was earlier allocated by
- * @xol_get_insn_slot(), make the slot available for
- * subsequent requests.
+ * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot()
*/
-static void xol_free_insn_slot(struct task_struct *tsk)
+static void xol_free_insn_slot(struct uprobe_task *utask)
{
- struct xol_area *area;
- unsigned long vma_end;
- unsigned long slot_addr;
-
- if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
- return;
+ struct xol_area *area = current->mm->uprobes_state.xol_area;
+ unsigned long offset = utask->xol_vaddr - area->vaddr;
+ unsigned int slot_nr;
- slot_addr = tsk->utask->xol_vaddr;
- if (unlikely(!slot_addr))
+ utask->xol_vaddr = 0;
+ /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */
+ if (WARN_ON_ONCE(offset >= PAGE_SIZE))
return;
- area = tsk->mm->uprobes_state.xol_area;
- vma_end = area->vaddr + PAGE_SIZE;
- if (area->vaddr <= slot_addr && slot_addr < vma_end) {
- unsigned long offset;
- int slot_nr;
-
- offset = slot_addr - area->vaddr;
- slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
- if (slot_nr >= UINSNS_PER_PAGE)
- return;
-
- clear_bit(slot_nr, area->bitmap);
- atomic_dec(&area->slot_count);
- smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
- if (waitqueue_active(&area->wq))
- wake_up(&area->wq);
-
- tsk->utask->xol_vaddr = 0;
- }
+ slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
+ clear_bit(slot_nr, area->bitmap);
+ smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
+ if (waitqueue_active(&area->wq))
+ wake_up(&area->wq);
}
void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
@@ -1750,11 +1889,18 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
return instruction_pointer(regs);
}
-static struct return_instance *free_ret_instance(struct return_instance *ri)
+static struct return_instance *free_ret_instance(struct return_instance *ri, bool cleanup_hprobe)
{
struct return_instance *next = ri->next;
- put_uprobe(ri->uprobe);
- kfree(ri);
+
+ if (cleanup_hprobe) {
+ enum hprobe_state hstate;
+
+ (void)hprobe_consume(&ri->hprobe, &hstate);
+ hprobe_finalize(&ri->hprobe, hstate);
+ }
+
+ kfree_rcu(ri, rcu);
return next;
}
@@ -1770,18 +1916,50 @@ void uprobe_free_utask(struct task_struct *t)
if (!utask)
return;
- if (utask->active_uprobe)
- put_uprobe(utask->active_uprobe);
+ WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr);
+
+ timer_delete_sync(&utask->ri_timer);
ri = utask->return_instances;
while (ri)
- ri = free_ret_instance(ri);
+ ri = free_ret_instance(ri, true /* cleanup_hprobe */);
- xol_free_insn_slot(t);
kfree(utask);
t->utask = NULL;
}
+#define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */
+
+#define for_each_ret_instance_rcu(pos, head) \
+ for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next))
+
+static void ri_timer(struct timer_list *timer)
+{
+ struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer);
+ struct return_instance *ri;
+
+ /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */
+ guard(srcu)(&uretprobes_srcu);
+ /* RCU protects return_instance from freeing. */
+ guard(rcu)();
+
+ for_each_ret_instance_rcu(ri, utask->return_instances)
+ hprobe_expire(&ri->hprobe, false);
+}
+
+static struct uprobe_task *alloc_utask(void)
+{
+ struct uprobe_task *utask;
+
+ utask = kzalloc(sizeof(*utask), GFP_KERNEL);
+ if (!utask)
+ return NULL;
+
+ timer_setup(&utask->ri_timer, ri_timer, 0);
+
+ return utask;
+}
+
/*
* Allocate a uprobe_task object for the task if necessary.
* Called when the thread hits a breakpoint.
@@ -1793,38 +1971,73 @@ void uprobe_free_utask(struct task_struct *t)
static struct uprobe_task *get_utask(void)
{
if (!current->utask)
- current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ current->utask = alloc_utask();
return current->utask;
}
+static size_t ri_size(int consumers_cnt)
+{
+ struct return_instance *ri;
+
+ return sizeof(*ri) + sizeof(ri->consumers[0]) * consumers_cnt;
+}
+
+#define DEF_CNT 4
+
+static struct return_instance *alloc_return_instance(void)
+{
+ struct return_instance *ri;
+
+ ri = kzalloc(ri_size(DEF_CNT), GFP_KERNEL);
+ if (!ri)
+ return ZERO_SIZE_PTR;
+
+ ri->consumers_cnt = DEF_CNT;
+ return ri;
+}
+
+static struct return_instance *dup_return_instance(struct return_instance *old)
+{
+ size_t size = ri_size(old->consumers_cnt);
+
+ return kmemdup(old, size, GFP_KERNEL);
+}
+
static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
{
struct uprobe_task *n_utask;
struct return_instance **p, *o, *n;
+ struct uprobe *uprobe;
- n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ n_utask = alloc_utask();
if (!n_utask)
return -ENOMEM;
t->utask = n_utask;
+ /* protect uprobes from freeing, we'll need try_get_uprobe() them */
+ guard(srcu)(&uretprobes_srcu);
+
p = &n_utask->return_instances;
for (o = o_utask->return_instances; o; o = o->next) {
- n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
+ n = dup_return_instance(o);
if (!n)
return -ENOMEM;
- *n = *o;
+ /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */
+ uprobe = hprobe_expire(&o->hprobe, true);
+
/*
- * uprobe's refcnt has to be positive at this point, kept by
- * utask->return_instances items; return_instances can't be
- * removed right now, as task is blocked due to duping; so
- * get_uprobe() is safe to use here.
+ * New utask will have stable properly refcounted uprobe or
+ * NULL. Even if we failed to get refcounted uprobe, we still
+ * need to preserve full set of return_instances for proper
+ * uretprobe handling and nesting in forked task.
*/
- get_uprobe(n->uprobe);
- n->next = NULL;
+ hprobe_init_stable(&n->hprobe, uprobe);
- *p = n;
+ n->next = NULL;
+ rcu_assign_pointer(*p, n);
p = &n->next;
+
n_utask->depth++;
}
@@ -1900,45 +2113,34 @@ static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
- ri = free_ret_instance(ri);
+ ri = free_ret_instance(ri, true /* cleanup_hprobe */);
utask->depth--;
}
- utask->return_instances = ri;
+ rcu_assign_pointer(utask->return_instances, ri);
}
-static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
+static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
+ struct return_instance *ri)
{
- struct return_instance *ri;
- struct uprobe_task *utask;
+ struct uprobe_task *utask = current->utask;
unsigned long orig_ret_vaddr, trampoline_vaddr;
bool chained;
+ int srcu_idx;
if (!get_xol_area())
- return;
-
- utask = get_utask();
- if (!utask)
- return;
+ goto free;
if (utask->depth >= MAX_URETPROBE_DEPTH) {
printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
" nestedness limit pid/tgid=%d/%d\n",
current->pid, current->tgid);
- return;
+ goto free;
}
- /* we need to bump refcount to store uprobe in utask */
- if (!try_get_uprobe(uprobe))
- return;
-
- ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
- if (!ri)
- goto fail;
-
trampoline_vaddr = uprobe_get_trampoline_vaddr();
orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
if (orig_ret_vaddr == -1)
- goto fail;
+ goto free;
/* drop the entries invalidated by longjmp() */
chained = (orig_ret_vaddr == trampoline_vaddr);
@@ -1956,53 +2158,51 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
* attack from user-space.
*/
uprobe_warn(current, "handle tail call");
- goto fail;
+ goto free;
}
orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
}
- ri->uprobe = uprobe;
+
+ /* __srcu_read_lock() because SRCU lock survives switch to user space */
+ srcu_idx = __srcu_read_lock(&uretprobes_srcu);
+
ri->func = instruction_pointer(regs);
ri->stack = user_stack_pointer(regs);
ri->orig_ret_vaddr = orig_ret_vaddr;
ri->chained = chained;
utask->depth++;
+
+ hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx);
ri->next = utask->return_instances;
- utask->return_instances = ri;
+ rcu_assign_pointer(utask->return_instances, ri);
+
+ mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD);
return;
-fail:
+free:
kfree(ri);
- put_uprobe(uprobe);
}
/* Prepare to single-step probed instruction out of line. */
static int
pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
{
- struct uprobe_task *utask;
- unsigned long xol_vaddr;
+ struct uprobe_task *utask = current->utask;
int err;
- utask = get_utask();
- if (!utask)
- return -ENOMEM;
-
if (!try_get_uprobe(uprobe))
return -EINVAL;
- xol_vaddr = xol_get_insn_slot(uprobe);
- if (!xol_vaddr) {
+ if (!xol_get_insn_slot(uprobe, utask)) {
err = -ENOMEM;
goto err_out;
}
- utask->xol_vaddr = xol_vaddr;
utask->vaddr = bp_vaddr;
-
err = arch_uprobe_pre_xol(&uprobe->arch, regs);
if (unlikely(err)) {
- xol_free_insn_slot(current);
+ xol_free_insn_slot(utask);
goto err_out;
}
@@ -2125,35 +2325,90 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
return uprobe;
}
+static struct return_instance*
+push_consumer(struct return_instance *ri, int idx, __u64 id, __u64 cookie)
+{
+ if (unlikely(ri == ZERO_SIZE_PTR))
+ return ri;
+
+ if (unlikely(idx >= ri->consumers_cnt)) {
+ struct return_instance *old_ri = ri;
+
+ ri->consumers_cnt += DEF_CNT;
+ ri = krealloc(old_ri, ri_size(old_ri->consumers_cnt), GFP_KERNEL);
+ if (!ri) {
+ kfree(old_ri);
+ return ZERO_SIZE_PTR;
+ }
+ }
+
+ ri->consumers[idx].id = id;
+ ri->consumers[idx].cookie = cookie;
+ return ri;
+}
+
+static struct return_consumer *
+return_consumer_find(struct return_instance *ri, int *iter, int id)
+{
+ struct return_consumer *ric;
+ int idx = *iter;
+
+ for (ric = &ri->consumers[idx]; idx < ri->consumers_cnt; idx++, ric++) {
+ if (ric->id == id) {
+ *iter = idx + 1;
+ return ric;
+ }
+ }
+ return NULL;
+}
+
+static bool ignore_ret_handler(int rc)
+{
+ return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE;
+}
+
static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
struct uprobe_consumer *uc;
- int remove = UPROBE_HANDLER_REMOVE;
- bool need_prep = false; /* prepare return uprobe, when needed */
- bool has_consumers = false;
+ bool has_consumers = false, remove = true;
+ struct return_instance *ri = NULL;
+ int push_idx = 0;
current->utask->auprobe = &uprobe->arch;
- list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
- srcu_read_lock_held(&uprobes_srcu)) {
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
+ bool session = uc->handler && uc->ret_handler;
+ __u64 cookie = 0;
int rc = 0;
if (uc->handler) {
- rc = uc->handler(uc, regs);
- WARN(rc & ~UPROBE_HANDLER_MASK,
+ rc = uc->handler(uc, regs, &cookie);
+ WARN(rc < 0 || rc > 2,
"bad rc=0x%x from %ps()\n", rc, uc->handler);
}
- if (uc->ret_handler)
- need_prep = true;
-
- remove &= rc;
+ remove &= rc == UPROBE_HANDLER_REMOVE;
has_consumers = true;
+
+ if (!uc->ret_handler || ignore_ret_handler(rc))
+ continue;
+
+ if (!ri)
+ ri = alloc_return_instance();
+
+ if (session)
+ ri = push_consumer(ri, push_idx++, uc->id, cookie);
}
current->utask->auprobe = NULL;
- if (need_prep && !remove)
- prepare_uretprobe(uprobe, regs); /* put bp at return */
+ if (!ZERO_OR_NULL_PTR(ri)) {
+ /*
+ * The push_idx value has the final number of return consumers,
+ * and ri->consumers_cnt has number of allocated consumers.
+ */
+ ri->consumers_cnt = push_idx;
+ prepare_uretprobe(uprobe, regs, ri);
+ }
if (remove && has_consumers) {
down_read(&uprobe->register_rwsem);
@@ -2169,19 +2424,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
}
static void
-handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
+handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs)
{
- struct uprobe *uprobe = ri->uprobe;
+ struct return_consumer *ric;
struct uprobe_consumer *uc;
- int srcu_idx;
+ int ric_idx = 0;
+
+ /* all consumers unsubscribed meanwhile */
+ if (unlikely(!uprobe))
+ return;
- srcu_idx = srcu_read_lock(&uprobes_srcu);
- list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
- srcu_read_lock_held(&uprobes_srcu)) {
- if (uc->ret_handler)
- uc->ret_handler(uc, ri->func, regs);
+ rcu_read_lock_trace();
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
+ bool session = uc->handler && uc->ret_handler;
+
+ if (uc->ret_handler) {
+ ric = return_consumer_find(ri, &ric_idx, uc->id);
+ if (!session || ric)
+ uc->ret_handler(uc, ri->func, regs, ric ? &ric->cookie : NULL);
+ }
}
- srcu_read_unlock(&uprobes_srcu, srcu_idx);
+ rcu_read_unlock_trace();
}
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
@@ -2200,6 +2463,8 @@ void uprobe_handle_trampoline(struct pt_regs *regs)
{
struct uprobe_task *utask;
struct return_instance *ri, *next;
+ struct uprobe *uprobe;
+ enum hprobe_state hstate;
bool valid;
utask = current->utask;
@@ -2230,21 +2495,24 @@ void uprobe_handle_trampoline(struct pt_regs *regs)
* trampoline addresses on the stack are replaced with correct
* original return addresses
*/
- utask->return_instances = ri->next;
+ rcu_assign_pointer(utask->return_instances, ri->next);
+
+ uprobe = hprobe_consume(&ri->hprobe, &hstate);
if (valid)
- handle_uretprobe_chain(ri, regs);
- ri = free_ret_instance(ri);
+ handle_uretprobe_chain(ri, uprobe, regs);
+ hprobe_finalize(&ri->hprobe, hstate);
+
+ /* We already took care of hprobe, no need to waste more time on that. */
+ ri = free_ret_instance(ri, false /* !cleanup_hprobe */);
utask->depth--;
} while (ri != next);
} while (!valid);
- utask->return_instances = ri;
return;
- sigill:
+sigill:
uprobe_warn(current, "handle uretprobe, sending SIGILL.");
force_sig(SIGILL);
-
}
bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
@@ -2266,13 +2534,13 @@ static void handle_swbp(struct pt_regs *regs)
{
struct uprobe *uprobe;
unsigned long bp_vaddr;
- int is_swbp, srcu_idx;
+ int is_swbp;
bp_vaddr = uprobe_get_swbp_addr(regs);
if (bp_vaddr == uprobe_get_trampoline_vaddr())
return uprobe_handle_trampoline(regs);
- srcu_idx = srcu_read_lock(&uprobes_srcu);
+ rcu_read_lock_trace();
uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
if (!uprobe) {
@@ -2330,7 +2598,7 @@ static void handle_swbp(struct pt_regs *regs)
out:
/* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
- srcu_read_unlock(&uprobes_srcu, srcu_idx);
+ rcu_read_unlock_trace();
}
/*
@@ -2353,7 +2621,7 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
put_uprobe(uprobe);
utask->active_uprobe = NULL;
utask->state = UTASK_RUNNING;
- xol_free_insn_slot(current);
+ xol_free_insn_slot(utask);
spin_lock_irq(&current->sighand->siglock);
recalc_sigpending(); /* see uprobe_deny_signal() */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 630b763e5240..f86c78961708 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3240,7 +3240,8 @@ uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm)
}
static int
-uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs)
+uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs,
+ __u64 *data)
{
struct bpf_uprobe *uprobe;
@@ -3249,7 +3250,8 @@ uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs)
}
static int
-uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs)
+uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs,
+ __u64 *data)
{
struct bpf_uprobe *uprobe;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index b30fc8fcd095..fed382b7881b 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -89,9 +89,11 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev)
static int register_uprobe_event(struct trace_uprobe *tu);
static int unregister_uprobe_event(struct trace_uprobe *tu);
-static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
+ __u64 *data);
static int uretprobe_dispatcher(struct uprobe_consumer *con,
- unsigned long func, struct pt_regs *regs);
+ unsigned long func, struct pt_regs *regs,
+ __u64 *data);
#ifdef CONFIG_STACK_GROWSUP
static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
@@ -1522,7 +1524,8 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
}
}
-static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
+ __u64 *data)
{
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
@@ -1553,7 +1556,8 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
}
static int uretprobe_dispatcher(struct uprobe_consumer *con,
- unsigned long func, struct pt_regs *regs)
+ unsigned long func, struct pt_regs *regs,
+ __u64 *data)
{
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 8835761d9a12..12005e3dc3e4 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -461,7 +461,7 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = {
static int
uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func,
- struct pt_regs *regs)
+ struct pt_regs *regs, __u64 *data)
{
regs->ax = 0x12345678deadbeef;