summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-09-18 15:03:58 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2024-09-18 15:03:58 +0200
commit9f0c253ddddca608457a42e509267bed2dee0a50 (patch)
tree6e00e1ed61e997c06169c70c9365f213fe412fc8 /include
parent941c122da5c8355335dc16011c1c291a32cd1118 (diff)
parent5e645f31139183ac9a282238da18ca6bbc1c6f4a (diff)
Merge tag 'perf-core-2024-09-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf events updates from Ingo Molnar: - Implement per-PMU context rescheduling to significantly improve single-PMU performance, and related cleanups/fixes (Peter Zijlstra and Namhyung Kim) - Fix ancient bug resulting in a lot of events being dropped erroneously at higher sampling frequencies (Luo Gengkun) - uprobes enhancements: - Implement RCU-protected hot path optimizations for better performance: "For baseline vs SRCU, peak througput increased from 3.7 M/s (million uprobe triggerings per second) up to about 8 M/s. For uretprobes it's a bit more modest with bump from 2.4 M/s to 5 M/s. For SRCU vs RCU Tasks Trace, peak throughput for uprobes increases further from 8 M/s to 10.3 M/s (+28%!), and for uretprobes from 5.3 M/s to 5.8 M/s (+11%), as we have more work to do on uretprobes side. Even single-thread (no contention) performance is slightly better: 3.276 M/s to 3.396 M/s (+3.5%) for uprobes, and 2.055 M/s to 2.174 M/s (+5.8%) for uretprobes." (Andrii Nakryiko et al) - Document mmap_lock, don't abuse get_user_pages_remote() (Oleg Nesterov) - Cleanups & fixes to prepare for future work: - Remove uprobe_register_refctr() - Simplify error handling for alloc_uprobe() - Make uprobe_register() return struct uprobe * - Fold __uprobe_unregister() into uprobe_unregister() - Shift put_uprobe() from delete_uprobe() to uprobe_unregister() - BPF: Fix use-after-free in bpf_uprobe_multi_link_attach() (Oleg Nesterov) - New feature & ABI extension: allow events to use PERF_SAMPLE READ with inheritance, enabling sample based profiling of a group of counters over a hierarchy of processes or threads (Ben Gainey) - Intel uncore & power events updates: - Add Arrow Lake and Lunar Lake support - Add PERF_EV_CAP_READ_SCOPE - Clean up and enhance cpumask and hotplug support (Kan Liang) - Add LNL uncore iMC freerunning support - Use D0:F0 as a default device (Zhenyu Wang) - Intel PT: fix AUX snapshot handling race (Adrian Hunter) - Misc fixes and cleanups (James Clark, Jiri Olsa, Oleg Nesterov and Peter Zijlstra) * tag 'perf-core-2024-09-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (40 commits) dmaengine: idxd: Clean up cpumask and hotplug for perfmon iommu/vt-d: Clean up cpumask and hotplug for perfmon perf/x86/intel/cstate: Clean up cpumask and hotplug perf: Add PERF_EV_CAP_READ_SCOPE perf: Generic hotplug support for a PMU with a scope uprobes: perform lockless SRCU-protected uprobes_tree lookup rbtree: provide rb_find_rcu() / rb_find_add_rcu() perf/uprobe: split uprobe_unregister() uprobes: travers uprobe's consumer list locklessly under SRCU protection uprobes: get rid of enum uprobe_filter_ctx in uprobe filter callbacks uprobes: protected uprobe lifetime with SRCU uprobes: revamp uprobe refcounting and lifetime management bpf: Fix use-after-free in bpf_uprobe_multi_link_attach() perf/core: Fix small negative period being ignored perf: Really fix event_function_call() locking perf: Optimize __pmu_ctx_sched_out() perf: Add context time freeze perf: Fix event_function_call() locking perf: Extract a few helpers perf: Optimize context reschedule for single PMU cases ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/cpuhotplug.h2
-rw-r--r--include/linux/perf_event.h32
-rw-r--r--include/linux/rbtree.h67
-rw-r--r--include/linux/uprobes.h48
4 files changed, 122 insertions, 27 deletions
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index f39186738d81..2361ed4d2b15 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -153,7 +153,6 @@ enum cpuhp_state {
CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
CPUHP_AP_PERF_X86_STARTING,
CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
- CPUHP_AP_PERF_X86_CSTATE_STARTING,
CPUHP_AP_PERF_XTENSA_STARTING,
CPUHP_AP_ARM_VFP_STARTING,
CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING,
@@ -210,7 +209,6 @@ enum cpuhp_state {
CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
CPUHP_AP_PERF_X86_AMD_POWER_ONLINE,
CPUHP_AP_PERF_X86_RAPL_ONLINE,
- CPUHP_AP_PERF_X86_CSTATE_ONLINE,
CPUHP_AP_PERF_S390_CF_ONLINE,
CPUHP_AP_PERF_S390_SF_ONLINE,
CPUHP_AP_PERF_ARM_CCI_ONLINE,
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e336306b8c08..fb908843f209 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -168,6 +168,9 @@ struct hw_perf_event {
struct hw_perf_event_extra extra_reg;
struct hw_perf_event_extra branch_reg;
};
+ struct { /* aux / Intel-PT */
+ u64 aux_config;
+ };
struct { /* software */
struct hrtimer hrtimer;
};
@@ -292,6 +295,19 @@ struct perf_event_pmu_context;
#define PERF_PMU_CAP_AUX_OUTPUT 0x0080
#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
+/**
+ * pmu::scope
+ */
+enum perf_pmu_scope {
+ PERF_PMU_SCOPE_NONE = 0,
+ PERF_PMU_SCOPE_CORE,
+ PERF_PMU_SCOPE_DIE,
+ PERF_PMU_SCOPE_CLUSTER,
+ PERF_PMU_SCOPE_PKG,
+ PERF_PMU_SCOPE_SYS_WIDE,
+ PERF_PMU_MAX_SCOPE,
+};
+
struct perf_output_handle;
#define PMU_NULL_DEV ((void *)(~0UL))
@@ -315,6 +331,11 @@ struct pmu {
*/
int capabilities;
+ /*
+ * PMU scope
+ */
+ unsigned int scope;
+
int __percpu *pmu_disable_count;
struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */
@@ -615,10 +636,13 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *,
* PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and
* cannot be a group leader. If an event with this flag is detached from the
* group it is scheduled out and moved into an unrecoverable ERROR state.
+ * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the
+ * PMU scope where it is active.
*/
#define PERF_EV_CAP_SOFTWARE BIT(0)
#define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1)
#define PERF_EV_CAP_SIBLING BIT(2)
+#define PERF_EV_CAP_READ_SCOPE BIT(3)
#define SWEVENT_HLIST_BITS 8
#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS)
@@ -963,12 +987,16 @@ struct perf_event_context {
struct rcu_head rcu_head;
/*
- * Sum (event->pending_work + event->pending_work)
+ * The count of events for which using the switch-out fast path
+ * should be avoided.
+ *
+ * Sum (event->pending_work + events with
+ * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)))
*
* The SIGTRAP is targeted at ctx->task, as such it won't do changing
* that until the signal is delivered.
*/
- local_t nr_pending;
+ local_t nr_no_switch_fast;
};
struct perf_cpu_pmu_context {
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index f7edca369eda..7c173aa64e1e 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -245,6 +245,42 @@ rb_find_add(struct rb_node *node, struct rb_root *tree,
}
/**
+ * rb_find_add_rcu() - find equivalent @node in @tree, or add @node
+ * @node: node to look-for / insert
+ * @tree: tree to search / modify
+ * @cmp: operator defining the node order
+ *
+ * Adds a Store-Release for link_node.
+ *
+ * Returns the rb_node matching @node, or NULL when no match is found and @node
+ * is inserted.
+ */
+static __always_inline struct rb_node *
+rb_find_add_rcu(struct rb_node *node, struct rb_root *tree,
+ int (*cmp)(struct rb_node *, const struct rb_node *))
+{
+ struct rb_node **link = &tree->rb_node;
+ struct rb_node *parent = NULL;
+ int c;
+
+ while (*link) {
+ parent = *link;
+ c = cmp(node, parent);
+
+ if (c < 0)
+ link = &parent->rb_left;
+ else if (c > 0)
+ link = &parent->rb_right;
+ else
+ return parent;
+ }
+
+ rb_link_node_rcu(node, parent, link);
+ rb_insert_color(node, tree);
+ return NULL;
+}
+
+/**
* rb_find() - find @key in tree @tree
* @key: key to match
* @tree: tree to search
@@ -273,6 +309,37 @@ rb_find(const void *key, const struct rb_root *tree,
}
/**
+ * rb_find_rcu() - find @key in tree @tree
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining the node order
+ *
+ * Notably, tree descent vs concurrent tree rotations is unsound and can result
+ * in false-negatives.
+ *
+ * Returns the rb_node matching @key or NULL.
+ */
+static __always_inline struct rb_node *
+rb_find_rcu(const void *key, const struct rb_root *tree,
+ int (*cmp)(const void *key, const struct rb_node *))
+{
+ struct rb_node *node = tree->rb_node;
+
+ while (node) {
+ int c = cmp(key, node);
+
+ if (c < 0)
+ node = rcu_dereference_raw(node->rb_left);
+ else if (c > 0)
+ node = rcu_dereference_raw(node->rb_right);
+ else
+ return node;
+ }
+
+ return NULL;
+}
+
+/**
* rb_find_first() - find the first @key in @tree
* @key: key to match
* @tree: tree to search
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index b503fafb7fb3..2b294bf1881f 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -16,6 +16,7 @@
#include <linux/types.h>
#include <linux/wait.h>
+struct uprobe;
struct vm_area_struct;
struct mm_struct;
struct inode;
@@ -27,22 +28,22 @@ struct page;
#define MAX_URETPROBE_DEPTH 64
-enum uprobe_filter_ctx {
- UPROBE_FILTER_REGISTER,
- UPROBE_FILTER_UNREGISTER,
- UPROBE_FILTER_MMAP,
-};
-
struct uprobe_consumer {
+ /*
+ * handler() can return UPROBE_HANDLER_REMOVE to signal the need to
+ * unregister uprobe for current process. If UPROBE_HANDLER_REMOVE is
+ * returned, filter() callback has to be implemented as well and it
+ * should return false to "confirm" the decision to uninstall uprobe
+ * for the current process. If filter() is omitted or returns true,
+ * UPROBE_HANDLER_REMOVE is effectively ignored.
+ */
int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
int (*ret_handler)(struct uprobe_consumer *self,
unsigned long func,
struct pt_regs *regs);
- bool (*filter)(struct uprobe_consumer *self,
- enum uprobe_filter_ctx ctx,
- struct mm_struct *mm);
+ bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
- struct uprobe_consumer *next;
+ struct list_head cons_node;
};
#ifdef CONFIG_UPROBES
@@ -76,6 +77,8 @@ struct uprobe_task {
struct uprobe *active_uprobe;
unsigned long xol_vaddr;
+ struct arch_uprobe *auprobe;
+
struct return_instance *return_instances;
unsigned int depth;
};
@@ -110,10 +113,10 @@ extern bool is_trap_insn(uprobe_opcode_t *insn);
extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t);
-extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
-extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
-extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
-extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
+extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
+extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
+extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
+extern void uprobe_unregister_sync(void);
extern int uprobe_mmap(struct vm_area_struct *vma);
extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
extern void uprobe_start_dup_mmap(void);
@@ -151,22 +154,21 @@ static inline void uprobes_init(void)
#define uprobe_get_trap_addr(regs) instruction_pointer(regs)
-static inline int
-uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
-{
- return -ENOSYS;
-}
-static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc)
+static inline struct uprobe *
+uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{
- return -ENOSYS;
+ return ERR_PTR(-ENOSYS);
}
static inline int
-uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add)
+uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add)
{
return -ENOSYS;
}
static inline void
-uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+}
+static inline void uprobe_unregister_sync(void)
{
}
static inline int uprobe_mmap(struct vm_area_struct *vma)