// SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2024 Dennis Kobert //! Perf Event. //! //! C header: [`include/linux/perf_event.h`](srctree/include/linux/perf_event.h) use crate::alloc::AllocError; use crate::error::from_err_ptr; use crate::prelude::*; use crate::types::ARef; use crate::{bindings, task::Task, types::Opaque}; /// Represents a type of performance event to monitor #[derive(Debug, Clone, Copy)] pub enum EventType { /// Hardware events like CPU cycles, instructions, cache misses Hardware(HardwareEvent), /// Software events like context switches, page faults Software(SoftwareEvent), /// Raw hardware-specific event configuration Raw(u64), } /// Hardware performance events that can be monitored #[derive(Debug, Clone, Copy)] #[non_exhaustive] pub enum HardwareEvent { /// Total CPU cycles (both used and idle) /// Does not include cycles when the CPU is idle CpuCycles, /// Total instructions executed by the CPU /// Can be used with CpuCycles to calculate Instructions per Cycle (IPC) Instructions, /// Cache operations that reference the CPU's cache hierarchy /// Includes all cache levels (L1, L2, LLC) CacheReferences, /// Cache operations that miss the CPU's cache hierarchy /// Requires memory access from RAM or other CPUs CacheMisses, /// Total branch instructions executed /// Used to monitor program flow changes BranchInstructions, /// Branch instructions that were mispredicted /// Indicates branch prediction efficiency BranchMisses, /// Bus cycles, indicating memory/system bus activity /// Useful for monitoring memory bus utilization BusCycles, /// Cycles where the CPU front-end is stalled /// Indicates instruction fetch or decode bottlenecks StalledCyclesFrontend, /// Cycles where the CPU back-end is stalled /// Indicates execution bottlenecks like resource conflicts StalledCyclesBackend, /// Total CPU cycles, including idle cycles /// Counts at a constant rate regardless of CPU frequency changes RefCpuCycles, } /// Software performance events that can be monitored #[derive(Debug, Clone, Copy)] #[non_exhaustive] pub enum SoftwareEvent { /// CPU clock, a high-resolution per-CPU timer /// Measures time spent on this CPU in nanoseconds CpuClock, /// Task clock, a high-resolution timer specific to the monitored task /// Measures time spent by this task on CPU in nanoseconds TaskClock, /// Total page faults (both minor and major) /// Triggered when a process accesses a memory page not currently mapped PageFaults, /// Process context switches /// Counts voluntary and involuntary context switches ContextSwitches, /// CPU migrations /// Counts when a process moves execution to a different CPU CpuMigrations, /// Minor page faults /// Page is in memory but not allocated to the process PageFaultsMin, /// Major page faults /// Page needs to be loaded from disk PageFaultsMaj, /// Memory alignment faults /// Occurs on unaligned memory accesses when they're not handled by hardware AlignmentFaults, /// Instruction emulation faults /// Occurs when the CPU needs to emulate an instruction in software EmulationFaults, } /// Perf Event Attr wrapper #[repr(transparent)] pub struct PerfEventAttr { inner: Opaque, } impl PerfEventAttr { /// Create a new PerfEventAttr from raw attr pub fn from_raw(attr: bindings::perf_event_attr) -> Self { Self { inner: Opaque::new(attr), } } /// Get a mutable pointer to the inner attr pub fn as_inner(&self) -> *mut bindings::perf_event_attr { self.inner.get() } } /// Wrapper for sample data pub struct SampleData { inner: Opaque, } impl SampleData { /// Returns a reference to the underlying data pub fn get(&self) -> Option<&bindings::perf_sample_data> { let ptr = self.inner.get(); if ptr.is_null() { return None; } Some(unsafe { &*ptr }) } } /// Wrapper for the current register values when the overflow handler is called pub struct Registers { inner: Opaque, } impl Registers { /// Returns a reference to the underlying data pub fn get(&self) -> Option<&bindings::pt_regs> { let ptr = self.inner.get(); if ptr.is_null() { return None; } Some(unsafe { &*ptr }) } } /// Handler function for overflow events /// Perf Event wrapper #[repr(transparent)] pub struct PerfEvent { inner: *mut bindings::perf_event, } /// Perf Event wrapper #[repr(transparent)] pub struct PerfEventRef { inner: *const bindings::perf_event, } // SAFETY: perf_event has internal locking for thread safety unsafe impl Send for PerfEvent {} unsafe impl Sync for PerfEvent {} impl PerfEvent { /// Returns a raw pointer to the inner C struct. #[inline] pub fn as_ptr(&self) -> *mut bindings::perf_event { self.inner } /// Enable the event for counting pub fn enable(&self) { unsafe { bindings::perf_event_enable(self.as_ptr()); } } /// Disable the event pub fn disable(&self) { unsafe { bindings::perf_event_disable(self.as_ptr()); } } /// Read the current value of the event counter pub fn read(&self) -> u64 { let mut enabled = 0; let mut running = 0; unsafe { bindings::perf_event_read_value(self.as_ptr(), &mut enabled, &mut running) } } } impl Drop for PerfEvent { fn drop(&mut self) { if !self.inner.is_null() { let context_ptr = unsafe { *self.inner }.overflow_handler_context as *mut OverflowHandler; if !context_ptr.is_null() { let OverflowHandler { closure, dyn_fn } = unsafe { context_ptr.read() }; let _ = dyn_fn; unsafe { KBox::from_raw(closure) }; } } unsafe { bindings::perf_event_release_kernel(self.inner); } } } /// Builder for configuring a performance event pub struct EventBuilder { event_type: EventType, sample_period: Option, sample_freq: Option, disabled: bool, inherit: bool, pinned: bool, exclusive: bool, exclude_user: bool, exclude_kernel: bool, exclude_hv: bool, exclude_idle: bool, cpu: Option, task: Option>, overflow_handler: Option, } /// Error type for performance event operations #[derive(Debug)] pub enum Error { /// Event creation failed InvalidConfig, /// Invalid CPU specified InvalidCpu, /// Invalid task specified InvalidTask, /// MemoryAllocation Error Alloc(AllocError), } impl From for Error { fn from(value: AllocError) -> Self { Self::Alloc(value) } } // Implementation of From traits for event types impl From for u64 { fn from(event: HardwareEvent) -> Self { use HardwareEvent::*; match event { CpuCycles => bindings::perf_hw_id_PERF_COUNT_HW_CPU_CYCLES as u64, Instructions => bindings::perf_hw_id_PERF_COUNT_HW_INSTRUCTIONS as u64, CacheReferences => bindings::perf_hw_id_PERF_COUNT_HW_CACHE_REFERENCES as u64, CacheMisses => bindings::perf_hw_id_PERF_COUNT_HW_CACHE_MISSES as u64, BranchInstructions => bindings::perf_hw_id_PERF_COUNT_HW_BRANCH_INSTRUCTIONS as u64, BranchMisses => bindings::perf_hw_id_PERF_COUNT_HW_BRANCH_MISSES as u64, BusCycles => bindings::perf_hw_id_PERF_COUNT_HW_BUS_CYCLES as u64, StalledCyclesFrontend => { bindings::perf_hw_id_PERF_COUNT_HW_STALLED_CYCLES_FRONTEND as u64 } StalledCyclesBackend => { bindings::perf_hw_id_PERF_COUNT_HW_STALLED_CYCLES_BACKEND as u64 } RefCpuCycles => bindings::perf_hw_id_PERF_COUNT_HW_REF_CPU_CYCLES as u64, } } } impl From for u64 { fn from(event: SoftwareEvent) -> Self { use SoftwareEvent::*; match event { CpuClock => bindings::perf_sw_ids_PERF_COUNT_SW_CPU_CLOCK as u64, TaskClock => bindings::perf_sw_ids_PERF_COUNT_SW_TASK_CLOCK as u64, PageFaults => bindings::perf_sw_ids_PERF_COUNT_SW_PAGE_FAULTS as u64, ContextSwitches => bindings::perf_sw_ids_PERF_COUNT_SW_CONTEXT_SWITCHES as u64, CpuMigrations => bindings::perf_sw_ids_PERF_COUNT_SW_CPU_MIGRATIONS as u64, PageFaultsMin => bindings::perf_sw_ids_PERF_COUNT_SW_PAGE_FAULTS_MIN as u64, PageFaultsMaj => bindings::perf_sw_ids_PERF_COUNT_SW_PAGE_FAULTS_MAJ as u64, AlignmentFaults => bindings::perf_sw_ids_PERF_COUNT_SW_ALIGNMENT_FAULTS as u64, EmulationFaults => bindings::perf_sw_ids_PERF_COUNT_SW_EMULATION_FAULTS as u64, } } } impl EventBuilder { /// Create a new event builder for the given event type pub fn new(event_type: EventType) -> Self { Self { event_type, sample_period: None, sample_freq: None, disabled: false, inherit: false, pinned: false, exclusive: false, exclude_user: false, exclude_kernel: false, exclude_hv: false, exclude_idle: false, cpu: None, task: None, overflow_handler: None, } } /// Set the sampling period (number of events between samples) pub fn sample_period(mut self, period: u64) -> Self { self.sample_period = Some(period); self.sample_freq = None; // Period and frequency are mutually exclusive self } /// Set the sampling frequency (samples per second) pub fn sample_freq(mut self, freq: u64) -> Self { self.sample_freq = Some(freq); self.sample_period = None; // Period and frequency are mutually exclusive self } /// Start the event disabled (must be explicitly enabled) pub fn disabled(mut self) -> Self { self.disabled = true; self } /// Child tasks inherit this event pub fn inherit(mut self) -> Self { self.inherit = true; self } /// Event must always be on PMU pub fn pinned(mut self) -> Self { self.pinned = true; self } /// Only group on PMU pub fn exclusive(mut self) -> Self { self.exclusive = true; self } /// Don't count user-space events pub fn exclude_user(mut self) -> Self { self.exclude_user = true; self } /// Don't count kernel events pub fn exclude_kernel(mut self) -> Self { self.exclude_kernel = true; self } /// Don't count hypervisor events pub fn exclude_hv(mut self) -> Self { self.exclude_hv = true; self } /// Don't count when CPU is idle pub fn exclude_idle(mut self) -> Self { self.exclude_idle = true; self } /// Monitor events on a specific CPU (-1 for all CPUs) pub fn cpu(mut self, cpu: i32) -> Self { self.cpu = Some(cpu); self } /// Monitor events for a specific task (None for per-CPU mode) pub fn task(mut self, task: ARef) -> Self { self.task = Some(task); self } /// Set handler for overflow events pub fn on_overflow(mut self, handler: OverflowHandler) -> Self { self.overflow_handler = Some(handler); self } /// Build the perf event pub fn build(self) -> Result { // Create the perf_event_attr structure let mut attr = bindings::perf_event_attr::default(); // Set the event type and configuration attr.type_ = match self.event_type { EventType::Hardware(_) => bindings::perf_type_id_PERF_TYPE_HARDWARE, EventType::Software(_) => bindings::perf_type_id_PERF_TYPE_SOFTWARE, EventType::Raw(_) => bindings::perf_type_id_PERF_TYPE_RAW, } as u32; attr.size = core::mem::size_of::() as u32; attr.config = match self.event_type { EventType::Hardware(hw) => hw.into(), EventType::Software(sw) => sw.into(), EventType::Raw(raw) => raw, }; // Set sampling configuration if let Some(period) = self.sample_period { attr.__bindgen_anon_1.sample_period = period; } else if let Some(freq) = self.sample_freq { attr.__bindgen_anon_1.sample_freq = freq; attr.set_freq(1); } // Set the configuration bits using the bindgen-generated setters attr.set_disabled(self.disabled as u64); attr.set_inherit(self.inherit as u64); attr.set_pinned(self.pinned as u64); attr.set_exclusive(self.exclusive as u64); attr.set_exclude_user(self.exclude_user as u64); attr.set_exclude_kernel(self.exclude_kernel as u64); attr.set_exclude_hv(self.exclude_hv as u64); attr.set_exclude_idle(self.exclude_idle as u64); let perf_event_attr = PerfEventAttr::from_raw(attr); let cpu = self.cpu.unwrap_or(-1); // Create the perf event using the existing kernel interface let event = perf_event_create_kernel_counter( perf_event_attr, cpu, self.task, self.overflow_handler, ) .map_err(|_| Error::InvalidConfig)?; Ok(event) } } /// Registers a new perf event counter /// /// # Arguments /// * `attr`: attributes of the counter to create /// * `cpu`: cpu to which the counter is bound (-1 for all CPUs) /// * `task`: task to profile (None for per-cpu) /// * `overflow_handler`: callback to trigger when we hit the event pub fn perf_event_create_kernel_counter( perf_event_attr: PerfEventAttr, cpu: i32, task: Option>, overflow: Option, ) -> Result { // Convert handler to C callback if provided // Create the perf event using kernel functions let raw_perf_event = unsafe { bindings::perf_event_create_kernel_counter( perf_event_attr.as_inner(), cpu, task.as_ref().map_or(core::ptr::null_mut(), |t| t.as_ptr()), overflow.is_some().then_some(overflow_trampoline), overflow.map_or(core::ptr::null_mut(), |x| { KBox::into_raw(KBox::new(x, crate::alloc::flags::GFP_KERNEL).unwrap()) as *mut crate::ffi::c_void }), ) }; if raw_perf_event.is_null() { pr_err!("event null"); return Err(Error::InvalidConfig); } let result = from_err_ptr(raw_perf_event); match result { Err(e) => { pr_err!("Encountered error during creation of perf event"); pr_err!("Error: {e:?}"); Err(Error::InvalidConfig) } Ok(raw_event) => Ok(PerfEvent { inner: raw_event }), } } unsafe extern "C" fn overflow_trampoline( perf_event: *mut bindings::perf_event, sample_data: *mut bindings::perf_sample_data, registers: *mut bindings::pt_regs, ) { if perf_event.is_null() { return; } let context_ptr = unsafe { *perf_event }.overflow_handler_context as *mut OverflowHandler; if context_ptr.is_null() { return; } let context = &mut unsafe { context_ptr.read() }; overflow_wrapper(perf_event, sample_data, registers, &mut *context.dyn_fn); } fn overflow_wrapper( perf_event: *mut bindings::perf_event, sample_data: *mut bindings::perf_sample_data, registers: *mut bindings::pt_regs, mut handler: impl FnMut(&PerfEventRef, &mut SampleData, &mut Registers), ) { if perf_event.is_null() || sample_data.is_null() || registers.is_null() { return; } handler( &PerfEventRef { inner: perf_event }, &mut SampleData { inner: Opaque::new(unsafe { *sample_data }), }, &mut Registers { inner: Opaque::new(unsafe { *registers }), }, ) } fn into_dyn( handler: impl Fn(&PerfEventRef, &mut SampleData, &mut Registers) + Send + Sync + 'static, ) -> Result { let b = KBox::new(handler, crate::alloc::flags::GFP_KERNEL)?; let b = Box::leak(b); let b_ptr = (b as *mut _) as *mut crate::ffi::c_void; let c = b as &'static mut (dyn FnMut(&PerfEventRef, &mut SampleData, &mut Registers) + Send + Sync + 'static); Ok(OverflowHandler { closure: b_ptr, dyn_fn: c, }) } /// Workaround for the missing support of using KBox as a fat pointer pub struct OverflowHandler { closure: *mut crate::ffi::c_void, dyn_fn: &'static mut (dyn FnMut(&PerfEventRef, &mut SampleData, &mut Registers) + Send + Sync + 'static), } impl OverflowHandler { /// Constructs a new overflow handler callback which is run when a performance counter overflows. /// /// # Safety /// The callback function is run in an NMI context: /// - Handler must be interrupt-safe /// - Handler must not block /// - Handler must not alloc /// - Handler must not panic pub unsafe fn new( handler: impl Fn(&PerfEventRef, &mut SampleData, &mut Registers) + Send + Sync + 'static, ) -> Result { into_dyn(handler) } }