From ee7751b564a90f337330efc1221df40647d68756 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Fri, 5 May 2023 13:58:55 -0700 Subject: tracing/user_events: Use long vs int for atomic bit ops Each event stores a int to track which bit to set/clear when enablement changes. On big endian 64-bit configurations, it's possible this could cause memory corruption when it's used for atomic bit operations. Use unsigned long for enablement values to ensure any possible corruption cannot occur. Downcast to int after mask for the bit target. Link: https://lore.kernel.org/all/6f758683-4e5e-41c3-9b05-9efc703e827c@kili.mountain/ Link: https://lore.kernel.org/linux-trace-kernel/20230505205855.6407-1-beaub@linux.microsoft.com Fixes: dcb8177c1395 ("tracing/user_events: Add ioctl for disabling addresses") Reported-by: Dan Carpenter Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index b1ecd7677642..e37c7f168c44 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -101,7 +101,7 @@ struct user_event_enabler { unsigned long addr; /* Track enable bit, flags, etc. Aligned for bitops. */ - unsigned int values; + unsigned long values; }; /* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */ @@ -116,7 +116,9 @@ struct user_event_enabler { /* Only duplicate the bit value */ #define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK -#define ENABLE_BITOPS(e) ((unsigned long *)&(e)->values) +#define ENABLE_BITOPS(e) (&(e)->values) + +#define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK)) /* Used for asynchronous faulting in of pages */ struct user_event_enabler_fault { @@ -423,9 +425,9 @@ static int user_event_enabler_write(struct user_event_mm *mm, /* Update bit atomically, user tracers must be atomic as well */ if (enabler->event && enabler->event->status) - set_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr); + set_bit(ENABLE_BIT(enabler), ptr); else - clear_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr); + clear_bit(ENABLE_BIT(enabler), ptr); kunmap_local(kaddr); unpin_user_pages_dirty_lock(&page, 1, true); @@ -440,8 +442,7 @@ static bool user_event_enabler_exists(struct user_event_mm *mm, struct user_event_enabler *next; list_for_each_entry_safe(enabler, next, &mm->enablers, link) { - if (enabler->addr == uaddr && - (enabler->values & ENABLE_VAL_BIT_MASK) == bit) + if (enabler->addr == uaddr && ENABLE_BIT(enabler) == bit) return true; } @@ -2272,7 +2273,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) list_for_each_entry_safe(enabler, next, &mm->enablers, link) if (enabler->addr == reg.disable_addr && - (enabler->values & ENABLE_VAL_BIT_MASK) == reg.disable_bit) { + ENABLE_BIT(enabler) == reg.disable_bit) { set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))) -- cgit v1.2.3-70-g09d2 From 632478a05821bc1c9b55c3a1dd0fb1be7bfa1acc Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Thu, 11 May 2023 18:32:01 +0200 Subject: tracing/timerlat: Always wakeup the timerlat thread While testing rtla timerlat auto analysis, I reach a condition where the interface was not receiving tracing data. 
I was able to manually reproduce the problem with these steps:

  # echo 0 > tracing_on                    # disable trace
  # echo 1 > osnoise/stop_tracing_us       # stop trace if timerlat irq > 1 us
  # echo timerlat > current_tracer         # enable timerlat tracer
  # sleep 1                                # wait... that is the time when rtla
                                           # apply configs like prio or cgroup
  # echo 1 > tracing_on                    # start tracing
  # cat trace
  # tracer: timerlat
  #
  #                                _-----=> irqs-off
  #                               / _----=> need-resched
  #                              | / _---=> hardirq/softirq
  #                              || / _--=> preempt-depth
  #                              ||| / _-=> migrate-disable
  #                              |||| /     delay
  #                              |||||            ACTIVATION
  #           TASK-PID      CPU# |||||   TIMESTAMP    ID      CONTEXT                LATENCY
  #              | |         |   |||||      |         |       |                      |
  NOTHING!

Then, trying to enable tracing again with echo 1 > tracing_on resulted in no change: the trace was still not tracing.

This problem happens because the timerlat IRQ hits the stop tracing condition while tracing is off and does not wake up the timerlat thread, so the timerlat threads are kept sleeping forever, resulting in no trace, even after re-enabling the tracer.

Avoid this condition by always waking up the threads, even after stopping tracing, allowing the tracer to return to its normal operation after tracing is turned on again.

Link: https://lore.kernel.org/linux-trace-kernel/1ed8f830638b20a39d535d27d908e319a9a3c4e2.1683822622.git.bristot@kernel.org

Cc: Juri Lelli
Cc: stable@vger.kernel.org
Fixes: a955d7eac177 ("trace: Add timerlat tracer")
Signed-off-by: Daniel Bristot de Oliveira
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_osnoise.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index efbbec2caff8..e97e3fa5cbed 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1652,6 +1652,8 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
 			osnoise_stop_tracing();
 			notify_new_max_latency(diff);
 
+			wake_up_process(tlat->kthread);
+
 			return HRTIMER_NORESTART;
 		}
 	}
-- cgit v1.2.3-70-g09d2

From 3e0fea09b17fa2255f6cb0108bbffd4a505f8925 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Fri, 19 May 2023 16:07:38 -0700
Subject: tracing/user_events: Split up mm alloc and attach

When a new mm is being created in a fork() path it currently is allocated and then attached in one go. This leaves the mm exposed to the tracing register callbacks while any parent enabler locations are copied in. This should not happen.

Split up mm alloc and attach as unique operations. When duplicating enablers, first alloc, then duplicate, and only upon success, attach. This prevents any timing window outside of the event_reg mutex for enablement walking. This allows for dropping the RCU requirement for enablement walking in later patches.
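As background for readers less familiar with the pattern, the stand-alone sketch below shows the general "allocate and fill in privately, publish last" shape that this split enforces. It is an illustration only, not code from the patch: every name in it is invented, it uses a plain mutex-protected singly linked list where the kernel code uses an RCU-protected list and list_add_rcu(), and the real functions are user_event_mm_alloc()/user_event_mm_attach() as seen in the diff below.

#include <pthread.h>
#include <stdlib.h>

struct obj {
	struct obj *next;
	int state;			/* stands in for the duplicated enabler data */
};

static struct obj *published;		/* readers walk this list */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct obj *obj_alloc(void)
{
	return calloc(1, sizeof(struct obj));
}

static void obj_attach(struct obj *o)
{
	/* Publish only after every field a reader relies on is valid. */
	pthread_mutex_lock(&list_lock);
	o->next = published;
	published = o;
	pthread_mutex_unlock(&list_lock);
}

int dup_and_publish(const struct obj *parent)
{
	struct obj *o = obj_alloc();

	if (!o)
		return -1;

	o->state = parent->state;	/* copy while the object is still private */
	obj_attach(o);			/* no window where o is visible half-built */

	return 0;
}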
Link: https://lkml.kernel.org/r/20230519230741.669-2-beaub@linux.microsoft.com Link: https://lore.kernel.org/linux-trace-kernel/CAHk-=whTBvXJuoi_kACo3qi5WZUmRrhyA-_=rRFsycTytmB6qw@mail.gmail.com/ Signed-off-by: Linus Torvalds [ change log written by Beau Belgrave ] Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index e37c7f168c44..599aab46a94b 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -539,10 +539,9 @@ static struct user_event_mm *user_event_mm_get_all(struct user_event *user) return found; } -static struct user_event_mm *user_event_mm_create(struct task_struct *t) +static struct user_event_mm *user_event_mm_alloc(struct task_struct *t) { struct user_event_mm *user_mm; - unsigned long flags; user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL_ACCOUNT); @@ -554,12 +553,6 @@ static struct user_event_mm *user_event_mm_create(struct task_struct *t) refcount_set(&user_mm->refcnt, 1); refcount_set(&user_mm->tasks, 1); - spin_lock_irqsave(&user_event_mms_lock, flags); - list_add_rcu(&user_mm->link, &user_event_mms); - spin_unlock_irqrestore(&user_event_mms_lock, flags); - - t->user_event_mm = user_mm; - /* * The lifetime of the memory descriptor can slightly outlast * the task lifetime if a ref to the user_event_mm is taken @@ -573,6 +566,17 @@ static struct user_event_mm *user_event_mm_create(struct task_struct *t) return user_mm; } +static void user_event_mm_attach(struct user_event_mm *user_mm, struct task_struct *t) +{ + unsigned long flags; + + spin_lock_irqsave(&user_event_mms_lock, flags); + list_add_rcu(&user_mm->link, &user_event_mms); + spin_unlock_irqrestore(&user_event_mms_lock, flags); + + t->user_event_mm = user_mm; +} + static struct user_event_mm *current_user_event_mm(void) { struct user_event_mm *user_mm = current->user_event_mm; @@ -580,10 +584,12 @@ static struct user_event_mm *current_user_event_mm(void) if (user_mm) goto inc; - user_mm = user_event_mm_create(current); + user_mm = user_event_mm_alloc(current); if (!user_mm) goto error; + + user_event_mm_attach(user_mm, current); inc: refcount_inc(&user_mm->refcnt); error: @@ -671,7 +677,7 @@ void user_event_mm_remove(struct task_struct *t) void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) { - struct user_event_mm *mm = user_event_mm_create(t); + struct user_event_mm *mm = user_event_mm_alloc(t); struct user_event_enabler *enabler; if (!mm) @@ -685,10 +691,11 @@ void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) rcu_read_unlock(); + user_event_mm_attach(mm, t); return; error: rcu_read_unlock(); - user_event_mm_remove(t); + user_event_mm_destroy(mm); } static bool current_user_event_enabler_exists(unsigned long uaddr, -- cgit v1.2.3-70-g09d2 From aaecdaf922835ed9a8ce56cdd9a8d40fe630257a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 May 2023 16:07:39 -0700 Subject: tracing/user_events: Remove RCU lock while pinning pages pin_user_pages_remote() can reschedule which means we cannot hold any RCU lock while using it. Now that enablers are not exposed out to the tracing register callbacks during fork(), there is clearly no need to require the RCU lock as event_mutex is enough to protect changes. 
Remove unneeded RCU usages when pinning pages and walking enablers with event_mutex held. Cleanup a misleading "safe" list walk that is not needed. During fork() duplication, remove unneeded RCU list add, since the list is not exposed yet. Link: https://lkml.kernel.org/r/20230519230741.669-3-beaub@linux.microsoft.com Link: https://lore.kernel.org/linux-trace-kernel/CAHk-=wiiBfT4zNS29jA0XEsy8EmbqTH1hAPdRJCDAJMD8Gxt5A@mail.gmail.com/ Fixes: 7235759084a4 ("tracing/user_events: Use remote writes for event enablement") Signed-off-by: Linus Torvalds [ change log written by Beau Belgrave ] Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 599aab46a94b..d34a59630e70 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -439,9 +439,8 @@ static bool user_event_enabler_exists(struct user_event_mm *mm, unsigned long uaddr, unsigned char bit) { struct user_event_enabler *enabler; - struct user_event_enabler *next; - list_for_each_entry_safe(enabler, next, &mm->enablers, link) { + list_for_each_entry(enabler, &mm->enablers, link) { if (enabler->addr == uaddr && ENABLE_BIT(enabler) == bit) return true; } @@ -456,19 +455,19 @@ static void user_event_enabler_update(struct user_event *user) struct user_event_mm *next; int attempt; + lockdep_assert_held(&event_mutex); + while (mm) { next = mm->next; mmap_read_lock(mm->mm); - rcu_read_lock(); - list_for_each_entry_rcu(enabler, &mm->enablers, link) { + list_for_each_entry(enabler, &mm->enablers, link) { if (enabler->event == user) { attempt = 0; user_event_enabler_write(mm, enabler, true, &attempt); } } - rcu_read_unlock(); mmap_read_unlock(mm->mm); user_event_mm_put(mm); mm = next; @@ -496,7 +495,9 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig, enabler->values = orig->values & ENABLE_VAL_DUP_MASK; refcount_inc(&enabler->event->refcnt); - list_add_rcu(&enabler->link, &mm->enablers); + + /* Enablers not exposed yet, RCU not required */ + list_add(&enabler->link, &mm->enablers); return true; } -- cgit v1.2.3-70-g09d2 From dcbd1ac2668b5fa02069ea96d581ca3f70a7543c Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Fri, 19 May 2023 16:07:40 -0700 Subject: tracing/user_events: Rename link fields for clarity Currently most list_head fields of various structs within user_events are simply named link. This causes folks to keep additional context in their head when working with the code, which can be confusing. Instead of using link, describe what the actual link is, for example: list_del_rcu(&mm->link); Changes into: list_del_rcu(&mm->mms_link); The reader now is given a hint the link is to the mms global list instead of having to remember or spot check within the code. 
Link: https://lkml.kernel.org/r/20230519230741.669-4-beaub@linux.microsoft.com Link: https://lore.kernel.org/linux-trace-kernel/CAHk-=wicngggxVpbnrYHjRTwGE0WYscPRM+L2HO2BF8ia1EXgQ@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 2 +- kernel/trace/trace_events_user.c | 40 ++++++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 19 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 2847f5a18a86..17d452b389de 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -17,7 +17,7 @@ #ifdef CONFIG_USER_EVENTS struct user_event_mm { - struct list_head link; + struct list_head mms_link; struct list_head enablers; struct mm_struct *mm; struct user_event_mm *next; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index d34a59630e70..238c7a0615fa 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -96,7 +96,7 @@ struct user_event { * these to track enablement sites that are tied to an event. */ struct user_event_enabler { - struct list_head link; + struct list_head mm_enablers_link; struct user_event *event; unsigned long addr; @@ -155,7 +155,7 @@ struct user_event_file_info { #define VALIDATOR_REL (1 << 1) struct user_event_validator { - struct list_head link; + struct list_head user_event_link; int offset; int flags; }; @@ -261,7 +261,7 @@ error: static void user_event_enabler_destroy(struct user_event_enabler *enabler) { - list_del_rcu(&enabler->link); + list_del_rcu(&enabler->mm_enablers_link); /* No longer tracking the event via the enabler */ refcount_dec(&enabler->event->refcnt); @@ -440,7 +440,7 @@ static bool user_event_enabler_exists(struct user_event_mm *mm, { struct user_event_enabler *enabler; - list_for_each_entry(enabler, &mm->enablers, link) { + list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) { if (enabler->addr == uaddr && ENABLE_BIT(enabler) == bit) return true; } @@ -461,7 +461,7 @@ static void user_event_enabler_update(struct user_event *user) next = mm->next; mmap_read_lock(mm->mm); - list_for_each_entry(enabler, &mm->enablers, link) { + list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) { if (enabler->event == user) { attempt = 0; user_event_enabler_write(mm, enabler, true, &attempt); @@ -497,7 +497,7 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig, refcount_inc(&enabler->event->refcnt); /* Enablers not exposed yet, RCU not required */ - list_add(&enabler->link, &mm->enablers); + list_add(&enabler->mm_enablers_link, &mm->enablers); return true; } @@ -527,13 +527,15 @@ static struct user_event_mm *user_event_mm_get_all(struct user_event *user) */ rcu_read_lock(); - list_for_each_entry_rcu(mm, &user_event_mms, link) - list_for_each_entry_rcu(enabler, &mm->enablers, link) + list_for_each_entry_rcu(mm, &user_event_mms, mms_link) { + list_for_each_entry_rcu(enabler, &mm->enablers, mm_enablers_link) { if (enabler->event == user) { mm->next = found; found = user_event_mm_get(mm); break; } + } + } rcu_read_unlock(); @@ -572,7 +574,7 @@ static void user_event_mm_attach(struct user_event_mm *user_mm, struct task_stru unsigned long flags; spin_lock_irqsave(&user_event_mms_lock, flags); - list_add_rcu(&user_mm->link, &user_event_mms); + list_add_rcu(&user_mm->mms_link, &user_event_mms); spin_unlock_irqrestore(&user_event_mms_lock, flags); t->user_event_mm = user_mm; @@ -601,7 
+603,7 @@ static void user_event_mm_destroy(struct user_event_mm *mm) { struct user_event_enabler *enabler, *next; - list_for_each_entry_safe(enabler, next, &mm->enablers, link) + list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) user_event_enabler_destroy(enabler); mmdrop(mm->mm); @@ -638,7 +640,7 @@ void user_event_mm_remove(struct task_struct *t) /* Remove the mm from the list, so it can no longer be enabled */ spin_lock_irqsave(&user_event_mms_lock, flags); - list_del_rcu(&mm->link); + list_del_rcu(&mm->mms_link); spin_unlock_irqrestore(&user_event_mms_lock, flags); /* @@ -686,9 +688,10 @@ void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) rcu_read_lock(); - list_for_each_entry_rcu(enabler, &old_mm->enablers, link) + list_for_each_entry_rcu(enabler, &old_mm->enablers, mm_enablers_link) { if (!user_event_enabler_dup(enabler, mm)) goto error; + } rcu_read_unlock(); @@ -757,7 +760,7 @@ retry: */ if (!*write_result) { refcount_inc(&enabler->event->refcnt); - list_add_rcu(&enabler->link, &user_mm->enablers); + list_add_rcu(&enabler->mm_enablers_link, &user_mm->enablers); } mutex_unlock(&event_mutex); @@ -913,8 +916,8 @@ static void user_event_destroy_validators(struct user_event *user) struct user_event_validator *validator, *next; struct list_head *head = &user->validators; - list_for_each_entry_safe(validator, next, head, link) { - list_del(&validator->link); + list_for_each_entry_safe(validator, next, head, user_event_link) { + list_del(&validator->user_event_link); kfree(validator); } } @@ -968,7 +971,7 @@ add_validator: validator->offset = offset; /* Want sequential access when validating */ - list_add_tail(&validator->link, &user->validators); + list_add_tail(&validator->user_event_link, &user->validators); add_field: field->type = type; @@ -1358,7 +1361,7 @@ static int user_event_validate(struct user_event *user, void *data, int len) void *pos, *end = data + len; u32 loc, offset, size; - list_for_each_entry(validator, head, link) { + list_for_each_entry(validator, head, user_event_link) { pos = data + validator->offset; /* Already done min_size check, no bounds check here */ @@ -2279,7 +2282,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) */ mutex_lock(&event_mutex); - list_for_each_entry_safe(enabler, next, &mm->enablers, link) + list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) { if (enabler->addr == reg.disable_addr && ENABLE_BIT(enabler) == reg.disable_bit) { set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); @@ -2290,6 +2293,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) /* Removed at least one */ ret = 0; } + } mutex_unlock(&event_mutex); -- cgit v1.2.3-70-g09d2 From ff9e1632d69e596d8ca256deb07433a8f3565038 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Fri, 19 May 2023 16:07:41 -0700 Subject: tracing/user_events: Document user_event_mm one-shot list usage During 6.4 development it became clear that the one-shot list used by the user_event_mm's next field was confusing to others. It is not clear how this list is protected or what the next field usage is for unless you are familiar with the code. Add comments into the user_event_mm struct indicating lock requirement and usage. Also document how and why this approach was used via comments in both user_event_enabler_update() and user_event_mm_get_all() and the rules to properly use it. 
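To make the "one-shot list" term above concrete, here is a stand-alone sketch of the idea; it is illustrative only and every name in it is invented. A temporary chain is threaded through an embedded next pointer while one big lock (standing in for event_mutex) is held, each entry is pinned with a reference so it survives the possibly-blocking per-entry work, and the chain is consumed exactly once. The real code additionally walks the global mm list under RCU, which is precisely why the reference must be taken before the RCU section is left.

#include <pthread.h>
#include <stddef.h>

struct mm_entry {
	struct mm_entry *all_next;	/* long-lived global list */
	struct mm_entry *next;		/* one-shot list, only valid under big_lock */
	int refs;
	int wants_update;
};

static struct mm_entry *all_entries;
static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;	/* stands in for event_mutex */

/* Build the one-shot list; caller must hold big_lock. */
static struct mm_entry *get_all(void)
{
	struct mm_entry *found = NULL;
	struct mm_entry *e;

	for (e = all_entries; e; e = e->all_next) {
		if (e->wants_update) {
			e->refs++;		/* pin: the entry must outlive the walk */
			e->next = found;	/* thread the temporary chain */
			found = e;
		}
	}

	return found;
}

void update_all(void)
{
	struct mm_entry *e, *next;

	pthread_mutex_lock(&big_lock);

	for (e = get_all(); e; e = next) {
		next = e->next;		/* dropping the pin may free e, so save first */

		/* ... possibly-blocking per-entry work goes here ... */

		e->refs--;		/* drop the pin taken in get_all() */
	}

	pthread_mutex_unlock(&big_lock);
}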
Link: https://lkml.kernel.org/r/20230519230741.669-5-beaub@linux.microsoft.com Link: https://lore.kernel.org/linux-trace-kernel/CAHk-=wicngggxVpbnrYHjRTwGE0WYscPRM+L2HO2BF8ia1EXgQ@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 1 + kernel/trace/trace_events_user.c | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 17d452b389de..8afa8c3a0973 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -20,6 +20,7 @@ struct user_event_mm { struct list_head mms_link; struct list_head enablers; struct mm_struct *mm; + /* Used for one-shot lists, protected by event_mutex */ struct user_event_mm *next; refcount_t refcnt; refcount_t tasks; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 238c7a0615fa..dbb14705d0d3 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -451,12 +451,25 @@ static bool user_event_enabler_exists(struct user_event_mm *mm, static void user_event_enabler_update(struct user_event *user) { struct user_event_enabler *enabler; - struct user_event_mm *mm = user_event_mm_get_all(user); struct user_event_mm *next; + struct user_event_mm *mm; int attempt; lockdep_assert_held(&event_mutex); + /* + * We need to build a one-shot list of all the mms that have an + * enabler for the user_event passed in. This list is only valid + * while holding the event_mutex. The only reason for this is due + * to the global mm list being RCU protected and we use methods + * which can wait (mmap_read_lock and pin_user_pages_remote). + * + * NOTE: user_event_mm_get_all() increments the ref count of each + * mm that is added to the list to prevent removal timing windows. + * We must always put each mm after they are used, which may wait. + */ + mm = user_event_mm_get_all(user); + while (mm) { next = mm->next; mmap_read_lock(mm->mm); @@ -515,6 +528,14 @@ static struct user_event_mm *user_event_mm_get_all(struct user_event *user) struct user_event_enabler *enabler; struct user_event_mm *mm; + /* + * We use the mm->next field to build a one-shot list from the global + * RCU protected list. To build this list the event_mutex must be held. + * This lets us build a list without requiring allocs that could fail + * when user based events are most wanted for diagnostics. + */ + lockdep_assert_held(&event_mutex); + /* * We do not want to block fork/exec while enablements are being * updated, so we use RCU to walk the current tasks that have used -- cgit v1.2.3-70-g09d2 From e30fbc618e97b38dbb49f1d44dcd0778d3f23b8c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 23 May 2023 22:11:08 -0400 Subject: tracing/histograms: Allow variables to have some modifiers Modifiers are used to change the behavior of keys. For instance, they can grouped into buckets, converted to syscall names (from the syscall identifier), show task->comm of the current pid, be an array of longs that represent a stacktrace, and more. It was found that nothing stopped a value from taking a modifier. As values are simple counters. If this happened, it would call code that was not expecting a modifier and crash the kernel. This was fixed by having the ___create_val_field() function test if a modifier was present and fail if one was. This fixed the crash. Now there's a problem with variables. 
Variables are used to pass fields from one event to another. Variables are allowed to have some modifiers, as the processing may need to happen at the time of the event (like stacktraces and comm names of the current pid). The issue is that it too uses __create_val_field(). Now that fails on modifiers, variables can no longer use them (this is a regression). As not all modifiers are for variables, have them use a separate check. Link: https://lore.kernel.org/linux-trace-kernel/20230523221108.064a5d82@rorschach.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Tom Zanussi Cc: Mark Rutland Fixes: e0213434fe3e4 ("tracing: Do not let histogram values have some modifiers") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 486cca3c2b75..543cb7dc84ad 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4238,13 +4238,19 @@ static int __create_val_field(struct hist_trigger_data *hist_data, goto out; } - /* Some types cannot be a value */ - if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | - HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 | - HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET | - HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE)) { - hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str)); - ret = -EINVAL; + /* values and variables should not have some modifiers */ + if (hist_field->flags & HIST_FIELD_FL_VAR) { + /* Variable */ + if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | + HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2)) + goto err; + } else { + /* Value */ + if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | + HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 | + HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET | + HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE)) + goto err; } hist_data->fields[val_idx] = hist_field; @@ -4256,6 +4262,9 @@ static int __create_val_field(struct hist_trigger_data *hist_data, ret = -EINVAL; out: return ret; + err: + hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str)); + return -EINVAL; } static int create_val_field(struct hist_trigger_data *hist_data, -- cgit v1.2.3-70-g09d2 From 4b512860bdbdddcf41467ebd394f27cb8dfb528c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 23 May 2023 23:09:13 -0400 Subject: tracing: Rename stacktrace field to common_stacktrace The histogram and synthetic events can use a pseudo event called "stacktrace" that will create a stacktrace at the time of the event and use it just like it was a normal field. We have other pseudo events such as "common_cpu" and "common_timestamp". To stay consistent with that, convert "stacktrace" to "common_stacktrace". As this was used in older kernels, to keep backward compatibility, this will act just like "common_cpu" did with "cpu". That is, "cpu" will be the same as "common_cpu" unless the event has a "cpu" field. In which case, the event's field is used. The same is true with "stacktrace". Also update the documentation to reflect this change. 
Link: https://lore.kernel.org/linux-trace-kernel/20230523230913.6860e28d@rorschach.local.home Cc: Masami Hiramatsu Cc: Tom Zanussi Cc: Mark Rutland Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/histogram.rst | 64 +++++++++++++++++++-------------------- include/linux/trace_events.h | 1 + kernel/trace/trace.c | 2 +- kernel/trace/trace_events.c | 2 ++ kernel/trace/trace_events_hist.c | 16 ++++++---- 5 files changed, 46 insertions(+), 39 deletions(-) (limited to 'kernel/trace') diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index 479c9eac6335..3c9b263de9c2 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -35,7 +35,7 @@ Documentation written by Tom Zanussi in place of an explicit value field - this is simply a count of event hits. If 'values' isn't specified, an implicit 'hitcount' value will be automatically created and used as the only value. - Keys can be any field, or the special string 'stacktrace', which + Keys can be any field, or the special string 'common_stacktrace', which will use the event's kernel stacktrace as the key. The keywords 'keys' or 'key' can be used to specify keys, and the keywords 'values', 'vals', or 'val' can be used to specify values. Compound @@ -54,7 +54,7 @@ Documentation written by Tom Zanussi 'compatible' if the fields named in the trigger share the same number and type of fields and those fields also have the same names. Note that any two events always share the compatible 'hitcount' and - 'stacktrace' fields and can therefore be combined using those + 'common_stacktrace' fields and can therefore be combined using those fields, however pointless that may be. 'hist' triggers add a 'hist' file to each event's subdirectory. @@ -547,9 +547,9 @@ Extended error information the hist trigger display symbolic call_sites, we can have the hist trigger additionally display the complete set of kernel stack traces that led to each call_site. To do that, we simply use the special - value 'stacktrace' for the key parameter:: + value 'common_stacktrace' for the key parameter:: - # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \ + # echo 'hist:keys=common_stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \ /sys/kernel/tracing/events/kmem/kmalloc/trigger The above trigger will use the kernel stack trace in effect when an @@ -561,9 +561,9 @@ Extended error information every callpath to a kmalloc for a kernel compile):: # cat /sys/kernel/tracing/events/kmem/kmalloc/hist - # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active] + # trigger info: hist:keys=common_stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active] - { stacktrace: + { common_stacktrace: __kmalloc_track_caller+0x10b/0x1a0 kmemdup+0x20/0x50 hidraw_report_event+0x8a/0x120 [hid] @@ -581,7 +581,7 @@ Extended error information cpu_startup_entry+0x315/0x3e0 rest_init+0x7c/0x80 } hitcount: 3 bytes_req: 21 bytes_alloc: 24 - { stacktrace: + { common_stacktrace: __kmalloc_track_caller+0x10b/0x1a0 kmemdup+0x20/0x50 hidraw_report_event+0x8a/0x120 [hid] @@ -596,7 +596,7 @@ Extended error information do_IRQ+0x5a/0xf0 ret_from_intr+0x0/0x30 } hitcount: 3 bytes_req: 21 bytes_alloc: 24 - { stacktrace: + { common_stacktrace: kmem_cache_alloc_trace+0xeb/0x150 aa_alloc_task_context+0x27/0x40 apparmor_cred_prepare+0x1f/0x50 @@ -608,7 +608,7 @@ Extended error information . . . 
- { stacktrace: + { common_stacktrace: __kmalloc+0x11b/0x1b0 i915_gem_execbuffer2+0x6c/0x2c0 [i915] drm_ioctl+0x349/0x670 [drm] @@ -616,7 +616,7 @@ Extended error information SyS_ioctl+0x81/0xa0 system_call_fastpath+0x12/0x6a } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808 - { stacktrace: + { common_stacktrace: __kmalloc+0x11b/0x1b0 load_elf_phdrs+0x76/0xa0 load_elf_binary+0x102/0x1650 @@ -625,7 +625,7 @@ Extended error information SyS_execve+0x3a/0x50 return_from_execve+0x0/0x23 } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048 - { stacktrace: + { common_stacktrace: kmem_cache_alloc_trace+0xeb/0x150 apparmor_file_alloc_security+0x27/0x40 security_file_alloc+0x16/0x20 @@ -636,7 +636,7 @@ Extended error information SyS_open+0x1e/0x20 system_call_fastpath+0x12/0x6a } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376 - { stacktrace: + { common_stacktrace: __kmalloc+0x11b/0x1b0 seq_buf_alloc+0x1b/0x50 seq_read+0x2cc/0x370 @@ -1026,7 +1026,7 @@ Extended error information First we set up an initially paused stacktrace trigger on the netif_receive_skb event:: - # echo 'hist:key=stacktrace:vals=len:pause' > \ + # echo 'hist:key=common_stacktrace:vals=len:pause' > \ /sys/kernel/tracing/events/net/netif_receive_skb/trigger Next, we set up an 'enable_hist' trigger on the sched_process_exec @@ -1060,9 +1060,9 @@ Extended error information $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz # cat /sys/kernel/tracing/events/net/netif_receive_skb/hist - # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused] + # trigger info: hist:keys=common_stacktrace:vals=len:sort=hitcount:size=2048 [paused] - { stacktrace: + { common_stacktrace: __netif_receive_skb_core+0x46d/0x990 __netif_receive_skb+0x18/0x60 netif_receive_skb_internal+0x23/0x90 @@ -1079,7 +1079,7 @@ Extended error information kthread+0xd2/0xf0 ret_from_fork+0x42/0x70 } hitcount: 85 len: 28884 - { stacktrace: + { common_stacktrace: __netif_receive_skb_core+0x46d/0x990 __netif_receive_skb+0x18/0x60 netif_receive_skb_internal+0x23/0x90 @@ -1097,7 +1097,7 @@ Extended error information irq_thread+0x11f/0x150 kthread+0xd2/0xf0 } hitcount: 98 len: 664329 - { stacktrace: + { common_stacktrace: __netif_receive_skb_core+0x46d/0x990 __netif_receive_skb+0x18/0x60 process_backlog+0xa8/0x150 @@ -1115,7 +1115,7 @@ Extended error information inet_sendmsg+0x64/0xa0 sock_sendmsg+0x3d/0x50 } hitcount: 115 len: 13030 - { stacktrace: + { common_stacktrace: __netif_receive_skb_core+0x46d/0x990 __netif_receive_skb+0x18/0x60 netif_receive_skb_internal+0x23/0x90 @@ -1142,14 +1142,14 @@ Extended error information into the histogram. In order to avoid having to set everything up again, we can just clear the histogram first:: - # echo 'hist:key=stacktrace:vals=len:clear' >> \ + # echo 'hist:key=common_stacktrace:vals=len:clear' >> \ /sys/kernel/tracing/events/net/netif_receive_skb/trigger Just to verify that it is in fact cleared, here's what we now see in the hist file:: # cat /sys/kernel/tracing/events/net/netif_receive_skb/hist - # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused] + # trigger info: hist:keys=common_stacktrace:vals=len:sort=hitcount:size=2048 [paused] Totals: Hits: 0 @@ -1485,12 +1485,12 @@ Extended error information And here's an example that shows how to combine histogram data from any two events even if they don't share any 'compatible' fields - other than 'hitcount' and 'stacktrace'. These commands create a + other than 'hitcount' and 'common_stacktrace'. 
These commands create a couple of triggers named 'bar' using those fields:: - # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \ + # echo 'hist:name=bar:key=common_stacktrace:val=hitcount' > \ /sys/kernel/tracing/events/sched/sched_process_fork/trigger - # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \ + # echo 'hist:name=bar:key=common_stacktrace:val=hitcount' > \ /sys/kernel/tracing/events/net/netif_rx/trigger And displaying the output of either shows some interesting if @@ -1501,16 +1501,16 @@ Extended error information # event histogram # - # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active] + # trigger info: hist:name=bar:keys=common_stacktrace:vals=hitcount:sort=hitcount:size=2048 [active] # - { stacktrace: + { common_stacktrace: kernel_clone+0x18e/0x330 kernel_thread+0x29/0x30 kthreadd+0x154/0x1b0 ret_from_fork+0x3f/0x70 } hitcount: 1 - { stacktrace: + { common_stacktrace: netif_rx_internal+0xb2/0xd0 netif_rx_ni+0x20/0x70 dev_loopback_xmit+0xaa/0xd0 @@ -1528,7 +1528,7 @@ Extended error information call_cpuidle+0x3b/0x60 cpu_startup_entry+0x22d/0x310 } hitcount: 1 - { stacktrace: + { common_stacktrace: netif_rx_internal+0xb2/0xd0 netif_rx_ni+0x20/0x70 dev_loopback_xmit+0xaa/0xd0 @@ -1543,7 +1543,7 @@ Extended error information SyS_sendto+0xe/0x10 entry_SYSCALL_64_fastpath+0x12/0x6a } hitcount: 2 - { stacktrace: + { common_stacktrace: netif_rx_internal+0xb2/0xd0 netif_rx+0x1c/0x60 loopback_xmit+0x6c/0xb0 @@ -1561,7 +1561,7 @@ Extended error information sock_sendmsg+0x38/0x50 ___sys_sendmsg+0x14e/0x270 } hitcount: 76 - { stacktrace: + { common_stacktrace: netif_rx_internal+0xb2/0xd0 netif_rx+0x1c/0x60 loopback_xmit+0x6c/0xb0 @@ -1579,7 +1579,7 @@ Extended error information sock_sendmsg+0x38/0x50 ___sys_sendmsg+0x269/0x270 } hitcount: 77 - { stacktrace: + { common_stacktrace: netif_rx_internal+0xb2/0xd0 netif_rx+0x1c/0x60 loopback_xmit+0x6c/0xb0 @@ -1597,7 +1597,7 @@ Extended error information sock_sendmsg+0x38/0x50 SYSC_sendto+0xef/0x170 } hitcount: 88 - { stacktrace: + { common_stacktrace: kernel_clone+0x18e/0x330 SyS_clone+0x19/0x20 entry_SYSCALL_64_fastpath+0x12/0x6a @@ -1949,7 +1949,7 @@ uninterruptible state:: # cd /sys/kernel/tracing # echo 's:block_lat pid_t pid; u64 delta; unsigned long[] stack;' > dynamic_events - # echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=stacktrace if prev_state == 2' >> events/sched/sched_switch/trigger + # echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=common_stacktrace if prev_state == 2' >> events/sched/sched_switch/trigger # echo 'hist:keys=prev_pid:delta=common_timestamp.usecs-$ts,s=$st:onmax($delta).trace(block_lat,prev_pid,$delta,$s)' >> events/sched/sched_switch/trigger # echo 1 > events/synthetic/block_lat/enable # cat trace diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 0e373222a6df..7c4a0b72334e 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -806,6 +806,7 @@ enum { FILTER_TRACE_FN, FILTER_COMM, FILTER_CPU, + FILTER_STACKTRACE, }; extern int trace_event_raw_init(struct trace_event_call *call); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ebc59781456a..81801dc31784 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5752,7 +5752,7 @@ static const char readme_msg[] = "\t table using the key(s) and value(s) named, and the value of a\n" "\t sum called 'hitcount' is incremented. Keys and values\n" "\t correspond to fields in the event's format description. 
Keys\n" - "\t can be any field, or the special string 'stacktrace'.\n" + "\t can be any field, or the special string 'common_stacktrace'.\n" "\t Compound keys consisting of up to two fields can be specified\n" "\t by the 'keys' keyword. Values must correspond to numeric\n" "\t fields. Sort keys consisting of up to two fields can be\n" diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 654ffa40457a..57e539d47989 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -194,6 +194,8 @@ static int trace_define_generic_fields(void) __generic_field(int, common_cpu, FILTER_CPU); __generic_field(char *, COMM, FILTER_COMM); __generic_field(char *, comm, FILTER_COMM); + __generic_field(char *, stacktrace, FILTER_STACKTRACE); + __generic_field(char *, STACKTRACE, FILTER_STACKTRACE); return ret; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 543cb7dc84ad..b97d3ad832f1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1364,7 +1364,7 @@ static const char *hist_field_name(struct hist_field *field, if (field->field) field_name = field->field->name; else - field_name = "stacktrace"; + field_name = "common_stacktrace"; } else if (field->flags & HIST_FIELD_FL_HITCOUNT) field_name = "hitcount"; @@ -2367,7 +2367,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->enable_timestamps = true; if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS) hist_data->attrs->ts_in_usecs = true; - } else if (strcmp(field_name, "stacktrace") == 0) { + } else if (strcmp(field_name, "common_stacktrace") == 0) { *flags |= HIST_FIELD_FL_STACKTRACE; } else if (strcmp(field_name, "common_cpu") == 0) *flags |= HIST_FIELD_FL_CPU; @@ -2378,11 +2378,15 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, if (!field || !field->size) { /* * For backward compatibility, if field_name - * was "cpu", then we treat this the same as - * common_cpu. This also works for "CPU". + * was "cpu" or "stacktrace", then we treat this + * the same as common_cpu and common_stacktrace + * respectively. This also works for "CPU", and + * "STACKTRACE". */ if (field && field->filter_type == FILTER_CPU) { *flags |= HIST_FIELD_FL_CPU; + } else if (field && field->filter_type == FILTER_STACKTRACE) { + *flags |= HIST_FIELD_FL_STACKTRACE; } else { hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); @@ -5394,7 +5398,7 @@ static void hist_trigger_print_key(struct seq_file *m, if (key_field->field) seq_printf(m, "%s.stacktrace", key_field->field->name); else - seq_puts(m, "stacktrace:\n"); + seq_puts(m, "common_stacktrace:\n"); hist_trigger_stacktrace_print(m, key + key_field->offset, HIST_STACKTRACE_DEPTH); @@ -5977,7 +5981,7 @@ static int event_hist_trigger_print(struct seq_file *m, if (field->field) seq_printf(m, "%s.stacktrace", field->field->name); else - seq_puts(m, "stacktrace"); + seq_puts(m, "common_stacktrace"); } else hist_field_print(m, field); } -- cgit v1.2.3-70-g09d2 From e8352cf5778c53b80fdcee086278b2048ddb8f98 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 28 May 2023 01:17:38 -0400 Subject: tracing: Move setting of tracing_selftest_running out of register_tracer() The variables tracing_selftest_running and tracing_selftest_disabled are only used for when CONFIG_FTRACE_STARTUP_TEST is enabled. Make them only visible within the selftest code. 
The setting of those variables are in the register_tracer() call, and set in a location where they do not need to be. Create a wrapper around run_tracer_selftest() called do_run_tracer_selftest() which sets those variables, and have register_tracer() call that instead. Having those variables only set within the CONFIG_FTRACE_STARTUP_TEST scope gets rid of them (and also the ability to remove testing against them) when the startup tests are not enabled (most cases). Link: https://lkml.kernel.org/r/20230528051742.1325503-2-rostedt@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 81801dc31784..87e5920b141f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2041,6 +2041,17 @@ static int run_tracer_selftest(struct tracer *type) return 0; } +static int do_run_tracer_selftest(struct tracer *type) +{ + int ret; + + tracing_selftest_running = true; + ret = run_tracer_selftest(type); + tracing_selftest_running = false; + + return ret; +} + static __init int init_trace_selftests(void) { struct trace_selftests *p, *n; @@ -2092,6 +2103,10 @@ static inline int run_tracer_selftest(struct tracer *type) { return 0; } +static inline int do_run_tracer_selftest(struct tracer *type) +{ + return 0; +} #endif /* CONFIG_FTRACE_STARTUP_TEST */ static void add_tracer_options(struct trace_array *tr, struct tracer *t); @@ -2127,8 +2142,6 @@ int __init register_tracer(struct tracer *type) mutex_lock(&trace_types_lock); - tracing_selftest_running = true; - for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ @@ -2157,7 +2170,7 @@ int __init register_tracer(struct tracer *type) /* store the tracer for __set_tracer_option */ type->flags->trace = type; - ret = run_tracer_selftest(type); + ret = do_run_tracer_selftest(type); if (ret < 0) goto out; @@ -2166,7 +2179,6 @@ int __init register_tracer(struct tracer *type) add_tracer_options(&global_trace, type); out: - tracing_selftest_running = false; mutex_unlock(&trace_types_lock); if (ret || !default_bootup_tracer) -- cgit v1.2.3-70-g09d2 From 9da705d432a07927526005a0688d81fbbf30e349 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 28 May 2023 01:17:39 -0400 Subject: tracing: Have tracer selftests call cond_resched() before running As there are more and more internal selftests being added to the Linux kernel (KSAN, lockdep, etc) the selftests are taking longer to run when these are enabled. Add a cond_resched() to the calling of do_run_tracer_selftest() to force a schedule if NEED_RESCHED is set, otherwise the soft lockup watchdog may trigger on boot up. Link: https://lkml.kernel.org/r/20230528051742.1325503-3-rostedt@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 87e5920b141f..70f2b511b9cd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2045,6 +2045,13 @@ static int do_run_tracer_selftest(struct tracer *type) { int ret; + /* + * Tests can take a long time, especially if they are run one after the + * other, as does happen during bootup when all the tracers are + * registered. This could cause the soft lockup watchdog to trigger. 
+ */ + cond_resched(); + tracing_selftest_running = true; ret = run_tracer_selftest(type); tracing_selftest_running = false; -- cgit v1.2.3-70-g09d2 From a3ae76d7ff781208100e6acc58eb09afb7b4b177 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 28 May 2023 01:17:40 -0400 Subject: tracing: Make tracing_selftest_running/delete nops when not used There's no reason to test the condition variables tracing_selftest_running or tracing_selftest_delete when tracing selftests are not enabled. Make them define 0s when not the selftests are not configured in. Link: https://lkml.kernel.org/r/20230528051742.1325503-4-rostedt@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 70f2b511b9cd..004f5f99e943 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -60,6 +60,7 @@ */ bool ring_buffer_expanded; +#ifdef CONFIG_FTRACE_STARTUP_TEST /* * We need to change this state when a selftest is running. * A selftest will lurk into the ring-buffer to count the @@ -75,7 +76,6 @@ static bool __read_mostly tracing_selftest_running; */ bool __read_mostly tracing_selftest_disabled; -#ifdef CONFIG_FTRACE_STARTUP_TEST void __init disable_tracing_selftest(const char *reason) { if (!tracing_selftest_disabled) { @@ -83,6 +83,9 @@ void __init disable_tracing_selftest(const char *reason) pr_info("Ftrace startup test is disabled due to %s\n", reason); } } +#else +#define tracing_selftest_running 0 +#define tracing_selftest_disabled 0 #endif /* Pipe tracepoints to printk */ -- cgit v1.2.3-70-g09d2 From ac9d2cb1d5f8e22235c399338504dadc87d14e74 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 28 May 2023 01:17:41 -0400 Subject: tracing: Only make selftest conditionals affect the global_trace The tracing_selftest_running and tracing_selftest_disabled variables were to keep trace_printk() and other writes from affecting the tracing selftests, as the tracing selftests would examine the ring buffer to see if it contained what it expected or not. trace_printk() and friends could add to the ring buffer and cause the selftests to fail (and then disable the tracer that was being tested). To keep that from happening, these variables were added and would keep trace_printk() and friends from writing to the ring buffer while the tests were going on. But this was only the top level ring buffer (owned by the global_trace instance). There is no reason to prevent writing into ring buffers of other instances via the trace_array_printk() and friends. For the functions that could be used by other instances, check if the global_trace is the tracer instance that is being written to before deciding to not allow the write. 
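To make the practical effect concrete, below is a small illustrative module sketch; it is not part of the patch and the module itself is hypothetical. It assumes the existing in-kernel tracing instance API as of this series (trace_array_get_by_name(), trace_array_init_printk(), trace_array_printk(), trace_array_put()); the comments describe how the new tr == &global_trace check limits the selftest gating to the top-level buffer.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/trace.h>

static int __init selftest_scope_demo_init(void)
{
	struct trace_array *tr;
	int ret;

	/* Hypothetical instance name, created (or looked up) with a reference held. */
	tr = trace_array_get_by_name("demo_instance");
	if (!tr)
		return -ENOMEM;

	ret = trace_array_init_printk(tr);
	if (ret)
		goto out;

	/*
	 * Not gated by tracing_selftest_running after this change, because
	 * tr != &global_trace: instance buffers keep recording.
	 */
	trace_array_printk(tr, _THIS_IP_, "instance write, still recorded\n");

	/*
	 * Still suppressed while a selftest runs, since trace_printk()
	 * always targets the top-level global_trace buffer.
	 */
	trace_printk("top-level write, gated during selftests\n");

out:
	trace_array_put(tr);
	return ret;
}
module_init(selftest_scope_demo_init);

MODULE_DESCRIPTION("Illustration only: scope of tracing selftest gating");
MODULE_LICENSE("GPL");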
Link: https://lkml.kernel.org/r/20230528051742.1325503-5-rostedt@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 004f5f99e943..64a4dde073ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1054,7 +1054,10 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; - if (unlikely(tracing_selftest_running || tracing_disabled)) + if (unlikely(tracing_selftest_running && tr == &global_trace)) + return 0; + + if (unlikely(tracing_disabled)) return 0; alloc = sizeof(*entry) + size + 2; /* possible \n added */ @@ -3512,7 +3515,7 @@ __trace_array_vprintk(struct trace_buffer *buffer, unsigned int trace_ctx; char *tbuffer; - if (tracing_disabled || tracing_selftest_running) + if (tracing_disabled) return 0; /* Don't pollute graph traces with trace_vprintk internals */ @@ -3560,6 +3563,9 @@ __printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { + if (tracing_selftest_running && tr == &global_trace) + return 0; + return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); } -- cgit v1.2.3-70-g09d2 From a2d910f02231f33118647fc438157ae69c073f89 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 28 May 2023 01:17:42 -0400 Subject: tracing: Have function_graph selftest call cond_resched() When all kernel debugging is enabled (lockdep, KSAN, etc), the function graph enabling and disabling can take several seconds to complete. The function_graph selftest enables and disables function graph tracing several times. With full debugging enabled, the soft lockup watchdog was triggering because the selftest was running without ever scheduling. Add cond_resched() throughout the test to make sure it does not trigger the soft lockup detector. Link: https://lkml.kernel.org/r/20230528051742.1325503-6-rostedt@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_selftest.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a931d9aaea26..529590499b1f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -848,6 +848,12 @@ trace_selftest_startup_function_graph(struct tracer *trace, } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + /* + * These tests can take some time to run. Make sure on non PREEMPT + * kernels, we do not trigger the softlockup detector. + */ + cond_resched(); + tracing_reset_online_cpus(&tr->array_buffer); set_graph_array(tr); @@ -869,6 +875,8 @@ trace_selftest_startup_function_graph(struct tracer *trace, if (ret) goto out; + cond_resched(); + ret = register_ftrace_graph(&fgraph_ops); if (ret) { warn_failed_init_tracer(trace, ret); @@ -891,6 +899,8 @@ trace_selftest_startup_function_graph(struct tracer *trace, if (ret) goto out; + cond_resched(); + tracing_start(); if (!ret && !count) { -- cgit v1.2.3-70-g09d2