summaryrefslogtreecommitdiff
path: root/rust/kernel/perf_event.rs
blob: 81fba34085033cd539535a2c3bc2a179071b54c2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2024 Dennis Kobert <dennis@kobert.dev>

//! Perf Event.
//!
//! C header: [`include/linux/perf_event.h`](srctree/include/linux/perf_event.h)

use crate::alloc::AllocError;
use crate::error::from_err_ptr;
use crate::prelude::*;
use crate::types::ARef;
use crate::{bindings, task::Task, types::Opaque};

/// Represents a type of performance event to monitor
///
/// Determines both the `perf_event_attr.type_` and `config` fields when the
/// event is built.
#[derive(Debug, Clone, Copy)]
pub enum EventType {
    /// Hardware events like CPU cycles, instructions, cache misses
    Hardware(HardwareEvent),
    /// Software events like context switches, page faults
    Software(SoftwareEvent),
    /// Raw hardware-specific event configuration
    /// The value is passed through verbatim as `perf_event_attr.config`
    Raw(u64),
}

/// Hardware performance events that can be monitored
///
/// Variants map to the kernel's `PERF_COUNT_HW_*` identifiers.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub enum HardwareEvent {
    /// Total CPU cycles
    /// Be wary of CPU frequency scaling: unlike `RefCpuCycles`, this does
    /// not count at a constant rate
    CpuCycles,

    /// Total instructions executed by the CPU
    /// Can be used with CpuCycles to calculate Instructions per Cycle (IPC)
    Instructions,

    /// Cache operations that reference the CPU's cache hierarchy
    /// Includes all cache levels (L1, L2, LLC)
    CacheReferences,

    /// Cache operations that miss the CPU's cache hierarchy
    /// Requires memory access from RAM or other CPUs
    CacheMisses,

    /// Total branch instructions executed
    /// Used to monitor program flow changes
    BranchInstructions,

    /// Branch instructions that were mispredicted
    /// Indicates branch prediction efficiency
    BranchMisses,

    /// Bus cycles, indicating memory/system bus activity
    /// Useful for monitoring memory bus utilization
    BusCycles,

    /// Cycles where the CPU front-end is stalled
    /// Indicates instruction fetch or decode bottlenecks
    StalledCyclesFrontend,

    /// Cycles where the CPU back-end is stalled
    /// Indicates execution bottlenecks like resource conflicts
    StalledCyclesBackend,

    /// Total CPU cycles, including idle cycles
    /// Counts at a constant rate regardless of CPU frequency changes
    RefCpuCycles,
}

/// Software performance events that can be monitored
///
/// Variants map to the kernel's `PERF_COUNT_SW_*` identifiers. These are
/// counted by the kernel itself and work on any hardware.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub enum SoftwareEvent {
    /// CPU clock, a high-resolution per-CPU timer
    /// Measures time spent on this CPU in nanoseconds
    CpuClock,

    /// Task clock, a high-resolution timer specific to the monitored task
    /// Measures time spent by this task on CPU in nanoseconds
    TaskClock,

    /// Total page faults (both minor and major)
    /// Triggered when a process accesses a memory page not currently mapped
    PageFaults,

    /// Process context switches
    /// Counts voluntary and involuntary context switches
    ContextSwitches,

    /// CPU migrations
    /// Counts when a process moves execution to a different CPU
    CpuMigrations,

    /// Minor page faults
    /// Page is in memory but not allocated to the process
    PageFaultsMin,

    /// Major page faults
    /// Page needs to be loaded from disk
    PageFaultsMaj,

    /// Memory alignment faults
    /// Occurs on unaligned memory accesses when they're not handled by hardware
    AlignmentFaults,

    /// Instruction emulation faults
    /// Occurs when the CPU needs to emulate an instruction in software
    EmulationFaults,
}

/// Wrapper around the C `perf_event_attr` configuration structure
///
/// `#[repr(transparent)]` over [`Opaque`] so a `&PerfEventAttr` can be used
/// wherever the C side expects a `struct perf_event_attr *`.
#[repr(transparent)]
pub struct PerfEventAttr {
    inner: Opaque<bindings::perf_event_attr>,
}

impl PerfEventAttr {
    /// Create a new PerfEventAttr from a raw, fully-populated attr structure
    pub fn from_raw(attr: bindings::perf_event_attr) -> Self {
        Self {
            inner: Opaque::new(attr),
        }
    }

    /// Get a mutable pointer to the inner attr
    ///
    /// The pointer refers to storage inside `self`, so it is only valid for
    /// as long as `self` is alive.
    pub fn as_inner(&self) -> *mut bindings::perf_event_attr {
        self.inner.get()
    }
}

/// Wrapper for sample data
///
/// Holds a by-value copy of the C `perf_sample_data` taken at overflow time.
pub struct SampleData {
    inner: Opaque<bindings::perf_sample_data>,
}

impl SampleData {
    /// Returns a reference to the underlying data
    ///
    /// NOTE(review): `Opaque::get` returns a pointer into `self`, so it can
    /// never actually be null here; the check is purely defensive.
    pub fn get(&self) -> Option<&bindings::perf_sample_data> {
        let ptr = self.inner.get();
        if ptr.is_null() {
            return None;
        }
        // SAFETY: `ptr` points at the `perf_sample_data` stored inside
        // `self`, and the shared borrow returned is tied to `&self`.
        Some(unsafe { &*ptr })
    }
}

/// Wrapper for the current register values when the overflow handler is called
///
/// Holds a by-value copy of the C `pt_regs` taken at overflow time.
pub struct Registers {
    inner: Opaque<bindings::pt_regs>,
}

impl Registers {
    /// Returns a reference to the underlying data
    ///
    /// NOTE(review): `Opaque::get` returns a pointer into `self`, so it can
    /// never actually be null here; the check is purely defensive.
    pub fn get(&self) -> Option<&bindings::pt_regs> {
        let ptr = self.inner.get();
        if ptr.is_null() {
            return None;
        }
        // SAFETY: `ptr` points at the `pt_regs` stored inside `self`, and
        // the shared borrow returned is tied to `&self`.
        Some(unsafe { &*ptr })
    }
}

/// Owned wrapper around a kernel `perf_event` counter
///
/// Releases the underlying event — and any overflow-handler context attached
/// to it — when dropped.
#[repr(transparent)]
pub struct PerfEvent {
    inner: *mut bindings::perf_event,
}

/// Borrowed, non-owning view of a kernel `perf_event`
///
/// Handed to overflow handlers; unlike [`PerfEvent`], it does not release
/// the event when dropped.
#[repr(transparent)]
pub struct PerfEventRef {
    inner: *const bindings::perf_event,
}

// SAFETY: perf_event has internal locking for thread safety
unsafe impl Send for PerfEvent {}
// SAFETY: all operations exposed through `&PerfEvent` go through C helpers
// that rely on the event's internal locking, so sharing references across
// threads is sound.
unsafe impl Sync for PerfEvent {}

impl PerfEvent {
    /// Returns a raw pointer to the inner C struct.
    #[inline]
    pub fn as_ptr(&self) -> *mut bindings::perf_event {
        self.inner
    }

    /// Enable the event for counting
    pub fn enable(&self) {
        // SAFETY: `self.inner` was returned by a successful
        // `perf_event_create_kernel_counter` call and stays valid until drop.
        unsafe {
            bindings::perf_event_enable(self.as_ptr());
        }
    }

    /// Disable the event
    pub fn disable(&self) {
        // SAFETY: see `enable` — the pointer is valid for the lifetime of
        // `self`.
        unsafe {
            bindings::perf_event_disable(self.as_ptr());
        }
    }

    /// Read the current value of the event counter
    ///
    /// The enabled/running times the kernel reports alongside the count are
    /// currently discarded; only the counter value is returned.
    pub fn read(&self) -> u64 {
        let mut enabled = 0;
        let mut running = 0;

        // SAFETY: the event pointer is valid and `enabled`/`running` are
        // live out-parameters for the duration of the call.
        unsafe { bindings::perf_event_read_value(self.as_ptr(), &mut enabled, &mut running) }
    }
}

impl Drop for PerfEvent {
    fn drop(&mut self) {
        if !self.inner.is_null() {
            let context_ptr =
                unsafe { *self.inner }.overflow_handler_context as *mut OverflowHandler;
            if !context_ptr.is_null() {
                let OverflowHandler { closure, dyn_fn } = unsafe { context_ptr.read() };
                let _ = dyn_fn;
                unsafe { KBox::from_raw(closure) };
            }
        }

        unsafe {
            bindings::perf_event_release_kernel(self.inner);
        }
    }
}

/// Builder for configuring a performance event
///
/// Collects configuration via chained setters; [`EventBuilder::build`]
/// translates it into a `perf_event_attr` and registers the counter.
pub struct EventBuilder {
    /// What to count (hardware, software, or raw PMU config)
    event_type: EventType,
    /// Events between samples; mutually exclusive with `sample_freq`
    sample_period: Option<u64>,
    /// Samples per second; mutually exclusive with `sample_period`
    sample_freq: Option<u64>,
    /// Start the event disabled
    disabled: bool,
    /// Child tasks inherit this event
    inherit: bool,
    /// Event must always be on the PMU
    pinned: bool,
    /// Only group on the PMU
    exclusive: bool,
    /// Don't count user-space events
    exclude_user: bool,
    /// Don't count kernel events
    exclude_kernel: bool,
    /// Don't count hypervisor events
    exclude_hv: bool,
    /// Don't count when the CPU is idle
    exclude_idle: bool,
    /// CPU to bind to; `None` becomes -1 (all CPUs)
    cpu: Option<i32>,
    /// Task to profile; `None` means per-CPU mode
    task: Option<ARef<Task>>,
    /// Optional callback invoked on counter overflow
    overflow_handler: Option<OverflowHandler>,
}

/// Error type for performance event operations
#[derive(Debug)]
pub enum Error {
    /// Event creation failed
    InvalidConfig,
    /// Invalid CPU specified
    InvalidCpu,
    /// Invalid task specified
    InvalidTask,
    /// Memory allocation error
    Alloc(AllocError),
}

impl From<AllocError> for Error {
    fn from(value: AllocError) -> Self {
        Self::Alloc(value)
    }
}

// Implementation of From traits for event types
impl From<HardwareEvent> for u64 {
    fn from(event: HardwareEvent) -> Self {
        use HardwareEvent::*;
        match event {
            CpuCycles => bindings::perf_hw_id_PERF_COUNT_HW_CPU_CYCLES as u64,
            Instructions => bindings::perf_hw_id_PERF_COUNT_HW_INSTRUCTIONS as u64,
            CacheReferences => bindings::perf_hw_id_PERF_COUNT_HW_CACHE_REFERENCES as u64,
            CacheMisses => bindings::perf_hw_id_PERF_COUNT_HW_CACHE_MISSES as u64,
            BranchInstructions => bindings::perf_hw_id_PERF_COUNT_HW_BRANCH_INSTRUCTIONS as u64,
            BranchMisses => bindings::perf_hw_id_PERF_COUNT_HW_BRANCH_MISSES as u64,
            BusCycles => bindings::perf_hw_id_PERF_COUNT_HW_BUS_CYCLES as u64,
            StalledCyclesFrontend => {
                bindings::perf_hw_id_PERF_COUNT_HW_STALLED_CYCLES_FRONTEND as u64
            }
            StalledCyclesBackend => {
                bindings::perf_hw_id_PERF_COUNT_HW_STALLED_CYCLES_BACKEND as u64
            }
            RefCpuCycles => bindings::perf_hw_id_PERF_COUNT_HW_REF_CPU_CYCLES as u64,
        }
    }
}

impl From<SoftwareEvent> for u64 {
    fn from(event: SoftwareEvent) -> Self {
        use SoftwareEvent::*;
        match event {
            CpuClock => bindings::perf_sw_ids_PERF_COUNT_SW_CPU_CLOCK as u64,
            TaskClock => bindings::perf_sw_ids_PERF_COUNT_SW_TASK_CLOCK as u64,
            PageFaults => bindings::perf_sw_ids_PERF_COUNT_SW_PAGE_FAULTS as u64,
            ContextSwitches => bindings::perf_sw_ids_PERF_COUNT_SW_CONTEXT_SWITCHES as u64,
            CpuMigrations => bindings::perf_sw_ids_PERF_COUNT_SW_CPU_MIGRATIONS as u64,
            PageFaultsMin => bindings::perf_sw_ids_PERF_COUNT_SW_PAGE_FAULTS_MIN as u64,
            PageFaultsMaj => bindings::perf_sw_ids_PERF_COUNT_SW_PAGE_FAULTS_MAJ as u64,
            AlignmentFaults => bindings::perf_sw_ids_PERF_COUNT_SW_ALIGNMENT_FAULTS as u64,
            EmulationFaults => bindings::perf_sw_ids_PERF_COUNT_SW_EMULATION_FAULTS as u64,
        }
    }
}

impl EventBuilder {
    /// Create a new event builder for the given event type
    ///
    /// All flags start cleared; CPU, task, and overflow handler unset.
    pub fn new(event_type: EventType) -> Self {
        Self {
            event_type,
            sample_period: None,
            sample_freq: None,
            disabled: false,
            inherit: false,
            pinned: false,
            exclusive: false,
            exclude_user: false,
            exclude_kernel: false,
            exclude_hv: false,
            exclude_idle: false,
            cpu: None,
            task: None,
            overflow_handler: None,
        }
    }

    /// Set the sampling period (number of events between samples)
    pub fn sample_period(mut self, period: u64) -> Self {
        self.sample_period = Some(period);
        self.sample_freq = None; // Period and frequency are mutually exclusive
        self
    }

    /// Set the sampling frequency (samples per second)
    pub fn sample_freq(mut self, freq: u64) -> Self {
        self.sample_freq = Some(freq);
        self.sample_period = None; // Period and frequency are mutually exclusive
        self
    }

    /// Start the event disabled (must be explicitly enabled)
    pub fn disabled(mut self) -> Self {
        self.disabled = true;
        self
    }

    /// Child tasks inherit this event
    pub fn inherit(mut self) -> Self {
        self.inherit = true;
        self
    }

    /// Event must always be on PMU
    pub fn pinned(mut self) -> Self {
        self.pinned = true;
        self
    }

    /// Only group on PMU
    pub fn exclusive(mut self) -> Self {
        self.exclusive = true;
        self
    }

    /// Don't count user-space events
    pub fn exclude_user(mut self) -> Self {
        self.exclude_user = true;
        self
    }

    /// Don't count kernel events
    pub fn exclude_kernel(mut self) -> Self {
        self.exclude_kernel = true;
        self
    }

    /// Don't count hypervisor events
    pub fn exclude_hv(mut self) -> Self {
        self.exclude_hv = true;
        self
    }

    /// Don't count when CPU is idle
    pub fn exclude_idle(mut self) -> Self {
        self.exclude_idle = true;
        self
    }

    /// Monitor events on a specific CPU (-1 for all CPUs)
    pub fn cpu(mut self, cpu: i32) -> Self {
        self.cpu = Some(cpu);
        self
    }

    /// Monitor events for a specific task (None for per-CPU mode)
    pub fn task(mut self, task: ARef<Task>) -> Self {
        self.task = Some(task);
        self
    }

    /// Set handler for overflow events
    pub fn on_overflow(mut self, handler: OverflowHandler) -> Self {
        self.overflow_handler = Some(handler);
        self
    }

    /// Build the perf event
    ///
    /// Translates this builder into a `perf_event_attr` and registers the
    /// counter with the kernel. Returns [`Error::InvalidConfig`] if the
    /// kernel rejects the configuration.
    pub fn build(self) -> Result<PerfEvent, Error> {
        // Create the perf_event_attr structure
        let mut attr = bindings::perf_event_attr::default();

        // Set the event type and configuration
        attr.type_ = match self.event_type {
            EventType::Hardware(_) => bindings::perf_type_id_PERF_TYPE_HARDWARE,
            EventType::Software(_) => bindings::perf_type_id_PERF_TYPE_SOFTWARE,
            EventType::Raw(_) => bindings::perf_type_id_PERF_TYPE_RAW,
        } as u32;

        // The kernel uses `size` to version-check the attr layout.
        attr.size = core::mem::size_of::<bindings::perf_event_attr>() as u32;
        attr.config = match self.event_type {
            EventType::Hardware(hw) => hw.into(),
            EventType::Software(sw) => sw.into(),
            EventType::Raw(raw) => raw,
        };

        // Set sampling configuration. `sample_period` and `sample_freq`
        // share a C union; the `freq` bit tells the kernel which one is set.
        if let Some(period) = self.sample_period {
            attr.__bindgen_anon_1.sample_period = period;
        } else if let Some(freq) = self.sample_freq {
            attr.__bindgen_anon_1.sample_freq = freq;
            attr.set_freq(1);
        }

        // Set the configuration bits using the bindgen-generated setters
        attr.set_disabled(self.disabled as u64);
        attr.set_inherit(self.inherit as u64);
        attr.set_pinned(self.pinned as u64);
        attr.set_exclusive(self.exclusive as u64);
        attr.set_exclude_user(self.exclude_user as u64);
        attr.set_exclude_kernel(self.exclude_kernel as u64);
        attr.set_exclude_hv(self.exclude_hv as u64);
        attr.set_exclude_idle(self.exclude_idle as u64);

        let perf_event_attr = PerfEventAttr::from_raw(attr);
        // -1 means "all CPUs" to the kernel.
        let cpu = self.cpu.unwrap_or(-1);

        // Create the perf event using the existing kernel interface
        let event = perf_event_create_kernel_counter(
            perf_event_attr,
            cpu,
            self.task,
            self.overflow_handler,
        )
        .map_err(|_| Error::InvalidConfig)?;

        Ok(event)
    }
}

/// Registers a new perf event counter
///
/// # Arguments
/// * `attr`: attributes of the counter to create
/// * `cpu`: cpu to which the counter is bound (-1 for all CPUs)
/// * `task`: task to profile (None for per-cpu)
/// * `overflow_handler`: callback to trigger when we hit the event
pub fn perf_event_create_kernel_counter(
    perf_event_attr: PerfEventAttr,
    cpu: i32,
    task: Option<ARef<Task>>,
    overflow: Option<OverflowHandler>,
) -> Result<PerfEvent, Error> {
    // Convert handler to C callback if provided
    // Create the perf event using kernel functions
    let raw_perf_event = unsafe {
        bindings::perf_event_create_kernel_counter(
            perf_event_attr.as_inner(),
            cpu,
            task.as_ref().map_or(core::ptr::null_mut(), |t| t.as_ptr()),
            overflow.is_some().then_some(overflow_trampoline),
            overflow.map_or(core::ptr::null_mut(), |x| {
                KBox::into_raw(KBox::new(x, crate::alloc::flags::GFP_KERNEL).unwrap())
                    as *mut crate::ffi::c_void
            }),
        )
    };

    if raw_perf_event.is_null() {
        pr_err!("event null");
        return Err(Error::InvalidConfig);
    }

    let result = from_err_ptr(raw_perf_event);
    match result {
        Err(e) => {
            pr_err!("Encountered error during creation of perf event");
            pr_err!("Error: {e:?}");
            Err(Error::InvalidConfig)
        }
        Ok(raw_event) => Ok(PerfEvent { inner: raw_event }),
    }
}

/// C-ABI callback registered with the kernel; recovers the Rust closure from
/// the event's `overflow_handler_context` and invokes it.
///
/// Runs in the kernel's overflow (NMI-like) context.
unsafe extern "C" fn overflow_trampoline(
    perf_event: *mut bindings::perf_event,
    sample_data: *mut bindings::perf_sample_data,
    registers: *mut bindings::pt_regs,
) {
    if perf_event.is_null() {
        return;
    }
    // SAFETY: `perf_event` is non-null and valid for the duration of the
    // callback; only the context field is read (the previous code
    // bitwise-copied the entire `perf_event` struct just to read it).
    let context_ptr = unsafe { (*perf_event).overflow_handler_context } as *mut OverflowHandler;
    if context_ptr.is_null() {
        return;
    }
    // SAFETY: the context was created from a `KBox<OverflowHandler>` in
    // `perf_event_create_kernel_counter` and stays alive until the event is
    // released. Borrow it in place rather than `read()`ing a bitwise copy,
    // which would duplicate the unique `&mut` closure reference it holds.
    let context = unsafe { &mut *context_ptr };
    overflow_wrapper(perf_event, sample_data, registers, &mut *context.dyn_fn);
}
/// Safe shim between the raw C pointers and the typed Rust handler.
///
/// Validates all pointers, wraps them in the module's safe types, and calls
/// `handler`.
fn overflow_wrapper(
    perf_event: *mut bindings::perf_event,
    sample_data: *mut bindings::perf_sample_data,
    registers: *mut bindings::pt_regs,
    mut handler: impl FnMut(&PerfEventRef, &mut SampleData, &mut Registers),
) {
    if perf_event.is_null() || sample_data.is_null() || registers.is_null() {
        return;
    }
    // NOTE(review): `*sample_data` / `*registers` copy the C structs by
    // value into fresh wrappers, so any mutation the handler performs
    // through `&mut SampleData` / `&mut Registers` is discarded — confirm
    // this is intended.
    handler(
        &PerfEventRef { inner: perf_event },
        &mut SampleData {
            inner: Opaque::new(unsafe { *sample_data }),
        },
        &mut Registers {
            inner: Opaque::new(unsafe { *registers }),
        },
    )
}
/// Boxes `handler` and splits it into the raw pieces stored in
/// [`OverflowHandler`]: a thin `c_void` pointer to the allocation and a
/// `'static` fat reference usable as a trait object.
///
/// The closure is intentionally leaked here; ownership is reclaimed when the
/// `PerfEvent` carrying the handler is dropped.
fn into_dyn(
    handler: impl Fn(&PerfEventRef, &mut SampleData, &mut Registers) + Send + Sync + 'static,
) -> Result<OverflowHandler, Error> {
    let b = KBox::new(handler, crate::alloc::flags::GFP_KERNEL)?;
    let b = Box::leak(b);
    // Thin pointer to the same allocation, suitable for the C context slot.
    let b_ptr = (b as *mut _) as *mut crate::ffi::c_void;
    // Fat (pointer + vtable) reference used to actually call the closure.
    let c = b as &'static mut (dyn FnMut(&PerfEventRef, &mut SampleData, &mut Registers)
                      + Send
                      + Sync
                      + 'static);

    Ok(OverflowHandler {
        closure: b_ptr,
        dyn_fn: c,
    })
}
/// Workaround for the missing support of using KBox as a fat pointer
///
/// Stores the leaked closure twice: `closure` is the thin pointer to the
/// allocation (passed to C as the context), `dyn_fn` the callable fat
/// reference. Both refer to the same allocation created by [`into_dyn`].
pub struct OverflowHandler {
    // Thin pointer to the leaked closure allocation.
    closure: *mut crate::ffi::c_void,
    // Callable trait-object reference to that same allocation.
    dyn_fn: &'static mut (dyn FnMut(&PerfEventRef, &mut SampleData, &mut Registers)
                      + Send
                      + Sync
                      + 'static),
}

impl OverflowHandler {
    /// Constructs a new overflow handler callback which is run when a performance counter overflows.
    ///
    /// The handler's allocation is owned by the [`PerfEvent`] it is attached
    /// to; if never attached to an event, it is leaked.
    ///
    /// # Safety
    /// The callback function is run in an NMI context:
    /// - Handler must be interrupt-safe
    /// - Handler must not block
    /// - Handler must not alloc
    /// - Handler must not panic
    pub unsafe fn new(
        handler: impl Fn(&PerfEventRef, &mut SampleData, &mut Registers) + Send + Sync + 'static,
    ) -> Result<Self, Error> {
        into_dyn(handler)
    }
}